# COUNTRIES

In [39]:
# Import the necessary libraries
import pandas as pd
import json
import requests
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

In [2]:
# Let each displayed column show up to 200 characters
pd.set_option("display.max_colwidth", 200)

# Load the per-country land temperature records
country_df = pd.read_csv(filepath_or_buffer="Data/GlobalLandTemperaturesByCountry.csv")
country_df

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
1,1743-12-01,,,Åland
2,1744-01-01,,,Åland
3,1744-02-01,,,Åland
4,1744-03-01,,,Åland
...,...,...,...,...
577457,2013-05-01,19.059,1.022,Zimbabwe
577458,2013-06-01,17.613,0.473,Zimbabwe
577459,2013-07-01,17.000,0.453,Zimbabwe
577460,2013-08-01,19.759,0.717,Zimbabwe


In [3]:
# Collect the column labels of the country frame into a plain Python list
columns = list(country_df.columns)
columns

['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'Country']

In [4]:
# Check the data types of each column (before any conversion)
country_df.dtypes

dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
Country                           object
dtype: object

In [5]:
# Parse the 'dt' column into pandas datetime values
country_df["dt"] = country_df["dt"].pipe(pd.to_datetime)

In [6]:
# Check the data types again to confirm 'dt' was converted
country_df.dtypes

dt                               datetime64[ns]
AverageTemperature                      float64
AverageTemperatureUncertainty           float64
Country                                  object
dtype: object

In [7]:
# Count the distinct country names in the dataset.
# nunique(dropna=False) returns the same number as len(unique()) -- a missing
# value, if present, is still counted once -- without building the array of
# unique values twice (the first unique() call was never displayed).
country_df["Country"].nunique(dropna=False)

243

In [41]:
# Average the temperature for every (date, country) pair and rename the columns.
#
# The rename runs first so that this cell is safe to re-run: 'country_df' is
# rebound to the result, after which 'dt' no longer exists. rename() ignores
# mapping keys that are not present, so a second run simply groups on the
# already-renamed columns instead of raising a KeyError on 'dt'.
#
# NOTE(review): 'dt' is grouped as-is (one date per month in the output below),
# so 'Year' / 'Yearly Average Temperature' still hold monthly values -- confirm
# whether a true per-year mean is intended.
country_df = (
    country_df
    .rename(columns={'dt': 'Year', 'AverageTemperature': 'Yearly Average Temperature'})
    .groupby(['Year', 'Country'])['Yearly Average Temperature']
    .mean()
    .reset_index()
)

# Preview the first rows of the result
country_df.head()

Unnamed: 0,Year,Country,Yearly Average Temperature
0,1743-11-01,Albania,8.62
1,1743-11-01,Andorra,7.556
2,1743-11-01,Austria,2.482
3,1743-11-01,Belarus,0.767
4,1743-11-01,Belgium,7.106


In [42]:
# Bounds of the period kept for analysis.
# 'earliest_date' and 'latest_date' were not defined anywhere in this notebook,
# so this cell (and the filter that follows it) raised NameError on a fresh
# kernel. The definitions below reproduce the recorded output.
# NOTE(review): 1950-02-01 is the same cutoff that is hard-coded in the States
# and Cities sections; the latest date is assumed to be the last date present
# in the country data -- confirm against the cell that originally set these.
earliest_date = pd.Timestamp("1950-02-01")
latest_date = country_df["Year"].max()

date_range = f"Our date range is {earliest_date} to {latest_date}"
print(date_range)

Our date range is 1950-02-01 00:00:00 to 2013-09-01 00:00:00


In [44]:
# Keep only the rows dated on or after the start of the analysis period
country_in_range = country_df['Year'] >= earliest_date
cleaned_countries = country_df.loc[country_in_range]
cleaned_countries.head()

Unnamed: 0,Year,Country,Yearly Average Temperature
391810,1950-02-01,Afghanistan,0.159
391811,1950-02-01,Africa,21.948
391812,1950-02-01,Albania,5.806
391813,1950-02-01,Algeria,14.176
391814,1950-02-01,American Samoa,26.772


In [47]:
# Discard every row that has a missing value in any column
final_df = cleaned_countries.dropna(axis="index", how="any")
final_df.head()

Unnamed: 0,Year,Country,Yearly Average Temperature
391810,1950-02-01,Afghanistan,0.159
391811,1950-02-01,Africa,21.948
391812,1950-02-01,Albania,5.806
391813,1950-02-01,Algeria,14.176
391814,1950-02-01,American Samoa,26.772


In [49]:
# Save the cleaned country data.
# No index=False is passed, so the row index is written as the first column.
countries_csv = "Cleaned Data/Countries.csv"
final_df.to_csv(countries_csv)

# STATES 

In [17]:
# Load the per-state land temperature records
state_df = pd.read_csv(filepath_or_buffer="Data/GlobalLandTemperaturesByState.csv")
state_df

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country
0,1855-05-01,25.544,1.171,Acre,Brazil
1,1855-06-01,24.228,1.103,Acre,Brazil
2,1855-07-01,24.371,1.044,Acre,Brazil
3,1855-08-01,25.427,1.073,Acre,Brazil
4,1855-09-01,25.675,1.014,Acre,Brazil
...,...,...,...,...,...
645670,2013-05-01,21.634,0.578,Zhejiang,China
645671,2013-06-01,24.679,0.596,Zhejiang,China
645672,2013-07-01,29.272,1.340,Zhejiang,China
645673,2013-08-01,29.202,0.869,Zhejiang,China


In [18]:
# Let each displayed column show up to 200 characters, then preview the first rows
pd.set_option("display.max_colwidth", 200)
state_df.head(5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country
0,1855-05-01,25.544,1.171,Acre,Brazil
1,1855-06-01,24.228,1.103,Acre,Brazil
2,1855-07-01,24.371,1.044,Acre,Brazil
3,1855-08-01,25.427,1.073,Acre,Brazil
4,1855-09-01,25.675,1.014,Acre,Brazil


In [19]:
# Collect the column labels of the state frame into a plain Python list
# (iterating a DataFrame yields its column labels)
columns = list(state_df)
columns

['dt',
 'AverageTemperature',
 'AverageTemperatureUncertainty',
 'State',
 'Country']

In [20]:
# Check the data types of each column (before any conversion)
state_df.dtypes

dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
State                             object
Country                           object
dtype: object

In [21]:
# Parse the 'dt' column into pandas datetime values
state_df["dt"] = state_df["dt"].pipe(pd.to_datetime)

In [22]:
# Check the data types again to confirm 'dt' was converted
state_df.dtypes

dt                               datetime64[ns]
AverageTemperature                      float64
AverageTemperatureUncertainty           float64
State                                    object
Country                                  object
dtype: object

In [23]:
# Count the distinct state names in the dataset.
# nunique(dropna=False) returns the same number as len(unique()) -- a missing
# value, if present, is still counted once -- without building the array of
# unique values twice (the first unique() call was never displayed).
state_df["State"].nunique(dropna=False)

241

In [24]:
# Average the temperature for every (date, state, country) combination and
# give the columns their output names.
# NOTE(review): 'dt' is grouped as-is (one date per month in the output below),
# so 'Year' / 'Yearly Average Temperature' still hold monthly values -- confirm
# whether a true per-year mean is intended.
yearly_avg_state_df = (
    state_df
    .groupby(['dt', 'State', 'Country'])['AverageTemperature']
    .mean()
    .reset_index()
    .rename(columns={'dt': 'Year', 'AverageTemperature': 'Yearly Average Temperature'})
)

# Preview the first rows of the result
yearly_avg_state_df.head()

Unnamed: 0,Year,State,Country,Yearly Average Temperature
0,1743-11-01,Adygey,Russia,4.537
1,1743-11-01,Alabama,United States,10.722
2,1743-11-01,Arkhangel'Sk,Russia,-8.008
3,1743-11-01,Belgorod,Russia,-0.91
4,1743-11-01,Bryansk,Russia,-0.707


In [25]:
# Keep only the rows dated on or after 1950-02-01
state_in_range = yearly_avg_state_df['Year'] >= "1950-02-01"
cleaned_states = yearly_avg_state_df.loc[state_in_range]
cleaned_states

Unnamed: 0,Year,State,Country,Yearly Average Temperature
461551,1950-02-01,Acre,Brazil,25.973
461552,1950-02-01,Adygey,Russia,-2.054
461553,1950-02-01,Aga Buryat,Russia,-18.944
461554,1950-02-01,Alabama,United States,11.922
461555,1950-02-01,Alagoas,Brazil,26.727
...,...,...,...,...
645670,2013-09-01,Yaroslavl',Russia,
645671,2013-09-01,Yevrey,Russia,
645672,2013-09-01,Yukon,Canada,5.267
645673,2013-09-01,Yunnan,China,


In [26]:
# Discard every row that has a missing value in any column
cleaned_states_grouped_dropna = cleaned_states.dropna(axis="index", how="any")
cleaned_states_grouped_dropna

Unnamed: 0,Year,State,Country,Yearly Average Temperature
461551,1950-02-01,Acre,Brazil,25.973
461552,1950-02-01,Adygey,Russia,-2.054
461553,1950-02-01,Aga Buryat,Russia,-18.944
461554,1950-02-01,Alabama,United States,11.922
461555,1950-02-01,Alagoas,Brazil,26.727
...,...,...,...,...
645661,2013-09-01,Washington,United States,16.823
645663,2013-09-01,West Virginia,United States,18.708
645665,2013-09-01,Wisconsin,United States,16.567
645666,2013-09-01,Wyoming,United States,15.811


In [27]:
# Save the cleaned state data.
# No index=False is passed, so the row index is written as the first column.
states_csv = "Cleaned Data/States.csv"
cleaned_states_grouped_dropna.to_csv(states_csv)

# CITIES 

In [28]:
# Let each displayed column show up to 200 characters
pd.set_option("display.max_colwidth", 200)

# The city data ships as a ZIP archive. Unpack it into Data/ first, because
# reading the archive directly with pd.read_csv did not work (compatibility issue).
import zipfile
with zipfile.ZipFile("Data/GlobalLandTemperaturesByCity.csv.zip") as city_archive:
    city_archive.extractall(path="Data/")

# Load the extracted per-city land temperature records
city_df = pd.read_csv(filepath_or_buffer="Data/GlobalLandTemperaturesByCity.csv")

city_df

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E
...,...,...,...,...,...,...,...
8599207,2013-05-01,11.464,0.236,Zwolle,Netherlands,52.24N,5.26E
8599208,2013-06-01,15.043,0.261,Zwolle,Netherlands,52.24N,5.26E
8599209,2013-07-01,18.775,0.193,Zwolle,Netherlands,52.24N,5.26E
8599210,2013-08-01,18.025,0.298,Zwolle,Netherlands,52.24N,5.26E


In [29]:
# Collect the column labels of the city frame into a plain Python list
city_columns = city_df.columns.to_list()
city_columns

['dt',
 'AverageTemperature',
 'AverageTemperatureUncertainty',
 'City',
 'Country',
 'Latitude',
 'Longitude']

In [30]:
# Check the data types of each column (before any conversion)
city_df.dtypes

dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
City                              object
Country                           object
Latitude                          object
Longitude                         object
dtype: object

In [31]:
# Parse the 'dt' column into pandas datetime values
city_df["dt"] = city_df["dt"].pipe(pd.to_datetime)

In [32]:
# Check the data types again to confirm 'dt' was converted
city_df.dtypes

dt                               datetime64[ns]
AverageTemperature                      float64
AverageTemperatureUncertainty           float64
City                                     object
Country                                  object
Latitude                                 object
Longitude                                object
dtype: object

In [33]:
# Count the distinct country names in the city dataset.
# nunique(dropna=False) returns the same number as len(unique()) -- a missing
# value, if present, is still counted once -- without building the array of
# unique values twice (the first unique() call was never displayed).
city_df["Country"].nunique(dropna=False)

159

In [34]:
# Count the distinct city names in the dataset.
# nunique(dropna=False) returns the same number as len(unique()) -- a missing
# value, if present, is still counted once -- without building the array of
# unique values twice (the first unique() call was never displayed).
city_df["City"].nunique(dropna=False)

3448

In [35]:
# Average the temperature for every (date, city, country) combination and
# give the columns their output names. Rows that share the same date, city
# name and country are averaged into a single row.
# NOTE(review): 'dt' is grouped as-is (one date per month in the output below),
# so 'Year' / 'Yearly Average Temperature' still hold monthly values -- confirm
# whether a true per-year mean is intended.
yearly_avg_city_df = (
    city_df
    .groupby(['dt', 'City', 'Country'])['AverageTemperature']
    .mean()
    .reset_index()
    .rename(columns={'dt': 'Year', 'AverageTemperature': 'Yearly Average Temperature'})
)

# Display the resulting frame
yearly_avg_city_df


Unnamed: 0,Year,City,Country,Yearly Average Temperature
0,1743-11-01,A Coruña,Spain,10.779
1,1743-11-01,Aachen,Germany,6.425
2,1743-11-01,Aalborg,Denmark,6.068
3,1743-11-01,Aberdeen,United Kingdom,8.758
4,1743-11-01,Aix En Provence,France,7.478
...,...,...,...,...
8553173,2013-09-01,Århus,Denmark,
8553174,2013-09-01,Çorlu,Turkey,
8553175,2013-09-01,Çorum,Turkey,
8553176,2013-09-01,Öskemen,Kazakhstan,


In [36]:
# Keep only the rows dated on or after 1950-02-01
city_in_range = yearly_avg_city_df['Year'] >= "1950-02-01"
cleaned_cities = yearly_avg_city_df.loc[city_in_range]
cleaned_cities

Unnamed: 0,Year,City,Country,Yearly Average Temperature
5886818,1950-02-01,A Coruña,Spain,10.035
5886819,1950-02-01,Aachen,Germany,4.661
5886820,1950-02-01,Aalborg,Denmark,1.423
5886821,1950-02-01,Aba,Nigeria,27.105
5886822,1950-02-01,Abadan,Iran,11.975
...,...,...,...,...
8553173,2013-09-01,Århus,Denmark,
8553174,2013-09-01,Çorlu,Turkey,
8553175,2013-09-01,Çorum,Turkey,
8553176,2013-09-01,Öskemen,Kazakhstan,


In [37]:
# Discard every row that has a missing value in any column
final_city_df = cleaned_cities.dropna(axis="index", how="any")
final_city_df

Unnamed: 0,Year,City,Country,Yearly Average Temperature
5886818,1950-02-01,A Coruña,Spain,10.035
5886819,1950-02-01,Aachen,Germany,4.661
5886820,1950-02-01,Aalborg,Denmark,1.423
5886821,1950-02-01,Aba,Nigeria,27.105
5886822,1950-02-01,Abadan,Iran,11.975
...,...,...,...,...
8553051,2013-09-01,Xico,Mexico,18.313
8553105,2013-09-01,Yonkers,United States,17.408
8553120,2013-09-01,Zacatecas,Mexico,18.830
8553125,2013-09-01,Zamora,Mexico,21.155


In [38]:
# Save the cleaned city data; index=False leaves the row index out of the file
cities_csv = "Cleaned Data/Cities.csv"
final_city_df.to_csv(cities_csv, index=False)