In [17]:
# Import the necessary libraries
import pandas as pd
import json
import requests
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

In [18]:
# Read the CSV file
pd.options.display.max_colwidth = 200
country_df = pd.read_csv("Data/GlobalLandTemperaturesByCountry.csv")
country_df

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
1,1743-12-01,,,Åland
2,1744-01-01,,,Åland
3,1744-02-01,,,Åland
4,1744-03-01,,,Åland
...,...,...,...,...
577457,2013-05-01,19.059,1.022,Zimbabwe
577458,2013-06-01,17.613,0.473,Zimbabwe
577459,2013-07-01,17.000,0.453,Zimbabwe
577460,2013-08-01,19.759,0.717,Zimbabwe


In [19]:
# Using 'to_list()' converts the column labels to a Python list
columns = country_df.columns.to_list()
columns

['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'Country']

In [20]:
# Checck data types
country_df.dtypes

dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
Country                           object
dtype: object

In [21]:
# Convert to date time format
country_df["dt"] = pd.to_datetime(country_df["dt"])

In [22]:
# Check the data types again
country_df.dtypes

dt                               datetime64[ns]
AverageTemperature                      float64
AverageTemperatureUncertainty           float64
Country                                  object
dtype: object

In [33]:
# Determine the number of unique countries in the dataset
country_df["Country"].unique()
len(country_df["Country"].unique())

243

In [38]:
countries_grouped = country_df.groupby("Country")
countries_grouped.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
1,1743-12-01,,,Åland
2,1744-01-01,,,Åland
3,1744-02-01,,,Åland
4,1744-03-01,,,Åland
...,...,...,...,...
575497,1850-01-01,22.187,1.618,Zimbabwe
575498,1850-02-01,23.538,1.635,Zimbabwe
575499,1850-03-01,22.528,2.195,Zimbabwe
575500,1850-04-01,20.000,1.817,Zimbabwe


In [47]:
# Find the earliest date all countries have in common

earliest_date = countries_grouped["dt"].min().max()
earliest_date

'1950-02-01'

In [48]:
# Find the latest date all countries have in common

latest_date = countries_grouped["dt"].max().min()
latest_date

'2013-09-01'

In [51]:
date_range = (f"Our date range is {earliest_date} to {latest_date}")
print(date_range)

Our date range is 1950-02-01 to 2013-09-01


In [56]:
cleaned_countries = country_df[country_df['dt']>=earliest_date]
cleaned_countries.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
2475,1950-02-01,-1.804,0.305,Åland
2476,1950-03-01,0.09,0.439,Åland
2477,1950-04-01,3.63,0.342,Åland
2478,1950-05-01,8.106,0.239,Åland
2479,1950-06-01,12.783,0.235,Åland


In [58]:
cleaned_countries_grouped = cleaned_countries.groupby(country_df["Country"])
cleaned_countries_grouped.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
2475,1950-02-01,-1.804,0.305,Åland
2476,1950-03-01,0.090,0.439,Åland
2477,1950-04-01,3.630,0.342,Åland
2478,1950-05-01,8.106,0.239,Åland
2479,1950-06-01,12.783,0.235,Åland
...,...,...,...,...
576698,1950-02-01,24.089,0.296,Zimbabwe
576699,1950-03-01,23.642,0.444,Zimbabwe
576700,1950-04-01,22.258,0.369,Zimbabwe
576701,1950-05-01,17.536,0.264,Zimbabwe


In [63]:
cleaned_countries_grouped_dropna = cleaned_countries.dropna()
cleaned_countries_grouped_dropna

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
2475,1950-02-01,-1.804,0.305,Åland
2476,1950-03-01,0.090,0.439,Åland
2477,1950-04-01,3.630,0.342,Åland
2478,1950-05-01,8.106,0.239,Åland
2479,1950-06-01,12.783,0.235,Åland
...,...,...,...,...
577456,2013-04-01,21.142,0.495,Zimbabwe
577457,2013-05-01,19.059,1.022,Zimbabwe
577458,2013-06-01,17.613,0.473,Zimbabwe
577459,2013-07-01,17.000,0.453,Zimbabwe


In [65]:
final_df = cleaned_countries_grouped_dropna.groupby(country_df["Country"])
final_df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
2475,1950-02-01,-1.804,0.305,Åland
2476,1950-03-01,0.090,0.439,Åland
2477,1950-04-01,3.630,0.342,Åland
2478,1950-05-01,8.106,0.239,Åland
2479,1950-06-01,12.783,0.235,Åland
...,...,...,...,...
576698,1950-02-01,24.089,0.296,Zimbabwe
576699,1950-03-01,23.642,0.444,Zimbabwe
576700,1950-04-01,22.258,0.369,Zimbabwe
576701,1950-05-01,17.536,0.264,Zimbabwe


In [None]:
final_df.to_csv("Countries.csv")

In [15]:
pd.options.display.max_colwidth = 200
# Extract the ZIP file manually as pd.read_csv function did not work due to compatibility issue
import zipfile
with zipfile.ZipFile("Data/GlobalLandTemperaturesByCity.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("Data/")

# Read the CSV file directly
city_df = pd.read_csv("Data/GlobalLandTemperaturesByCity.csv")

city_df

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E
...,...,...,...,...,...,...,...
8599207,2013-05-01,11.464,0.236,Zwolle,Netherlands,52.24N,5.26E
8599208,2013-06-01,15.043,0.261,Zwolle,Netherlands,52.24N,5.26E
8599209,2013-07-01,18.775,0.193,Zwolle,Netherlands,52.24N,5.26E
8599210,2013-08-01,18.025,0.298,Zwolle,Netherlands,52.24N,5.26E


In [18]:
# Converts the column labels to a Python list
city_columns = list(city_df)
city_columns

['dt',
 'AverageTemperature',
 'AverageTemperatureUncertainty',
 'City',
 'Country',
 'Latitude',
 'Longitude']

In [19]:
# Checck data types
city_df.dtypes

dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
City                              object
Country                           object
Latitude                          object
Longitude                         object
dtype: object

In [22]:
# Convert to date time format
city_df["dt"] = pd.to_datetime(city_df["dt"])

In [23]:
# Check the data types again
city_df.dtypes

dt                               datetime64[ns]
AverageTemperature                      float64
AverageTemperatureUncertainty           float64
City                                     object
Country                                  object
Latitude                                 object
Longitude                                object
dtype: object

In [24]:
# Determine the number of unique countries in the dataset
city_df["Country"].unique()
len(city_df["Country"].unique())

159

In [25]:
# Determine the number of unique cities in the dataset
city_df["City"].unique()
len(city_df["City"].unique())

3448

In [45]:
# Group by 'Year', 'City', and 'Country' and calculate the mean temperature for each year
yearly_avg_city_df = city_df.groupby(['dt', 'City', 'Country'])['AverageTemperature'].mean().reset_index()

# Rename the columns
yearly_avg_city_df = yearly_avg_city_df.rename(columns={'dt': 'Year', 'AverageTemperature': 'Yearly Average Temperature'})

# Display the DataFrame with the average temperature for each year
(yearly_avg_city_df)


Unnamed: 0,Year,City,Country,Yearly Average Temperature
0,1743-11-01,A Coruña,Spain,10.779
1,1743-11-01,Aachen,Germany,6.425
2,1743-11-01,Aalborg,Denmark,6.068
3,1743-11-01,Aberdeen,United Kingdom,8.758
4,1743-11-01,Aix En Provence,France,7.478
...,...,...,...,...
8553173,2013-09-01,Århus,Denmark,
8553174,2013-09-01,Çorlu,Turkey,
8553175,2013-09-01,Çorum,Turkey,
8553176,2013-09-01,Öskemen,Kazakhstan,


In [49]:
cleaned_cities = yearly_avg_city_df[yearly_avg_city_df['Year']>="1950-02-01"]
cleaned_cities

Unnamed: 0,Year,City,Country,Yearly Average Temperature
5886818,1950-02-01,A Coruña,Spain,10.035
5886819,1950-02-01,Aachen,Germany,4.661
5886820,1950-02-01,Aalborg,Denmark,1.423
5886821,1950-02-01,Aba,Nigeria,27.105
5886822,1950-02-01,Abadan,Iran,11.975
...,...,...,...,...
8553173,2013-09-01,Århus,Denmark,
8553174,2013-09-01,Çorlu,Turkey,
8553175,2013-09-01,Çorum,Turkey,
8553176,2013-09-01,Öskemen,Kazakhstan,


In [56]:
final_city_df = cleaned_cities.dropna()
final_city_df

Unnamed: 0,Year,City,Country,Yearly Average Temperature
5886818,1950-02-01,A Coruña,Spain,10.035
5886819,1950-02-01,Aachen,Germany,4.661
5886820,1950-02-01,Aalborg,Denmark,1.423
5886821,1950-02-01,Aba,Nigeria,27.105
5886822,1950-02-01,Abadan,Iran,11.975
...,...,...,...,...
8553051,2013-09-01,Xico,Mexico,18.313
8553105,2013-09-01,Yonkers,United States,17.408
8553120,2013-09-01,Zacatecas,Mexico,18.830
8553125,2013-09-01,Zamora,Mexico,21.155


In [58]:
# final_city_df = cleaned_cities_dropna.groupby(['City', 'Country']).reset_index()
final_city_df.to_csv("Cleaned Data/Cities.csv", index=False)