In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Cities DataFrame

In [2]:
# Read the cities table from disk and preview its first five rows
cities_df = pd.read_csv('./weatherData/cities.csv')
cities_df.head(n=5)

Unnamed: 0,station_id,city_name,country,state,iso2,iso3,latitude,longitude
0,41515,Asadabad,Afghanistan,Kunar,AF,AFG,34.866,71.150005
1,38954,Fayzabad,Afghanistan,Badakhshan,AF,AFG,37.129761,70.579247
2,41560,Jalalabad,Afghanistan,Nangarhar,AF,AFG,34.441527,70.436103
3,38947,Kunduz,Afghanistan,Kunduz,AF,AFG,36.727951,68.87253
4,38987,Qala i Naw,Afghanistan,Badghis,AF,AFG,34.983,63.1333


In [3]:
# Print the column names, dtypes and non-null counts of the cities table
cities_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1245 entries, 0 to 1244
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   station_id  1245 non-null   object 
 1   city_name   1244 non-null   object 
 2   country     1245 non-null   object 
 3   state       1217 non-null   object 
 4   iso2        1239 non-null   object 
 5   iso3        1245 non-null   object 
 6   latitude    1245 non-null   float64
 7   longitude   1245 non-null   float64
dtypes: float64(2), object(6)
memory usage: 77.9+ KB


## Basic Statistics

In [4]:
# What is the average latitude and longitude of the cities in the dataset?
# Each mean is rounded to two decimals so the printed sentence stays readable.
avg_lat = round(cities_df['latitude'].mean(), 2)
avg_lon = round(cities_df['longitude'].mean(), 2)

print(f"Average latitude is {avg_lat}, and the average longitude is {avg_lon}")

Average latitude is 23.73, and the average longitude is 20.78


In [5]:
# How many unique countries and states are represented in the dataset?
# nunique() ignores missing values, so rows without a state are not counted.
country_count, state_count = (
    cities_df[column].nunique() for column in ('country', 'state')
)

print(f"There are {country_count} countries and {state_count} states in the dataset")

There are 216 countries and 1149 states in the dataset


## Country Analysis

In [6]:
# How many cities are there in each country?
# Counts non-null city names per country and lists the largest counts first.
(
    cities_df
    .groupby('country')['city_name']
    .count()
    .sort_values(ascending=False)
)

country
Russia                      77
United States of America    49
Turkey                      44
Thailand                    38
India                       29
                            ..
Latvia                       1
Lebanon                      1
Cuba                         1
Oman                         1
Hong Kong                    1
Name: city_name, Length: 216, dtype: int64

In [7]:
# Group by country and count the number of weather stations
# (count() tallies the non-null station_id values in each group)
station_count_by_country = (
    cities_df
    .groupby('country')['station_id']
    .count()
)

# Find the country with the most weather stations, then look up its count
country_with_most_stations = station_count_by_country.idxmax()
max_stations = station_count_by_country[country_with_most_stations]

print(
    f"The country with the most weather stations is {country_with_most_stations} "
    f"with {max_stations} stations."
)


The country with the most weather stations is Russia with 77 stations.


# Countries DataFrame

## Population Analysis

In [8]:
# Read the countries table from disk and preview its first five rows
population_df = pd.read_csv('./weatherData/countries.csv')
population_df.head(n=5)

Unnamed: 0,country,native_name,iso2,iso3,population,area,capital,capital_lat,capital_lng,region,continent
0,Afghanistan,افغانستان,AF,AFG,26023100.0,652230.0,Kabul,34.526011,69.177684,Southern and Central Asia,Asia
1,Albania,Shqipëria,AL,ALB,2895947.0,28748.0,Tirana,41.326873,19.818791,Southern Europe,Europe
2,Algeria,الجزائر,DZ,DZA,38700000.0,2381741.0,Algiers,36.775361,3.060188,Northern Africa,Africa
3,American Samoa,American Samoa,AS,ASM,55519.0,199.0,Pago Pago,-14.275479,-170.70483,Polynesia,Oceania
4,Angola,Angola,AO,AGO,24383301.0,1246700.0,Luanda,-8.82727,13.243951,Central Africa,Africa


In [9]:
# Print the column names, dtypes and non-null counts of the countries table
population_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   country      214 non-null    object 
 1   native_name  213 non-null    object 
 2   iso2         213 non-null    object 
 3   iso3         214 non-null    object 
 4   population   210 non-null    float64
 5   area         207 non-null    float64
 6   capital      212 non-null    object 
 7   capital_lat  212 non-null    float64
 8   capital_lng  212 non-null    float64
 9   region       205 non-null    object 
 10  continent    206 non-null    object 
dtypes: float64(4), object(7)
memory usage: 18.5+ KB


In [10]:
# Count the missing values in each column of the countries table
population_df.isna().sum()

country        0
native_name    1
iso2           1
iso3           0
population     4
area           7
capital        2
capital_lat    2
capital_lng    2
region         9
continent      8
dtype: int64

In [11]:
# Keep only fully populated rows: a row is discarded when ANY of its columns is null.
# NOTE(review): this may also remove countries whose only gap is a field the
# population analysis does not use (e.g. native_name or capital) - confirm that is intended.
population_df = population_df.dropna(axis=0, how='any')

# Re-count nulls per column to confirm nothing is left
population_df.isna().sum()

country        0
native_name    0
iso2           0
iso3           0
population     0
area           0
capital        0
capital_lat    0
capital_lng    0
region         0
continent      0
dtype: int64

In [12]:
# Which countries have the highest and lowest populations?
# Total population per country, ordered from most to least populous
population_count_by_country = (
    population_df
    .groupby('country')['population']
    .sum()
    .sort_values(ascending=False)
)

# Most populous country and its population
country_with_highest_population = population_count_by_country.idxmax()
max_population_count = population_count_by_country.max()

# Least populous country and its population
country_with_lowest_population = population_count_by_country.idxmin()
lowest_population_count = population_count_by_country.min()

print(
    f"{country_with_highest_population} has the highest population of {max_population_count} "
    f"and the {country_with_lowest_population} has the lowest population of {lowest_population_count}"
)

China has the highest population of 1367110000.0 and the French Southern and Antarctic Lands has the lowest population of 140.0


In [13]:
# Calculate the average population for each region and continent.
# Both results are Series indexed by the grouping column.
avg_population_by_region = (
    population_df.groupby('region')['population'].mean()
)
avg_population_by_continent = (
    population_df.groupby('continent')['population'].mean()
)

# Display the results: a heading line followed by the Series itself
print("Average Population by Region:", avg_population_by_region, sep="\n")
print("\nAverage Population by Continent:", avg_population_by_continent, sep="\n")

Average Population by Region:
region
Antarctica                   1.400000e+02
Australia and New Zealand    1.517425e+06
Baltic Countries             2.078310e+06
British Isles                3.524183e+07
Caribbean                    2.720357e+06
Central Africa               9.838065e+06
Central America              2.312048e+07
Eastern Africa               2.050143e+07
Eastern Asia                 2.633106e+08
Eastern Europe               2.937403e+07
Melanesia                    2.275918e+06
Micronesia                   9.394700e+04
Middle East                  1.468841e+07
Nordic Countries             5.269666e+06
North America                7.098521e+07
Northern Africa              3.070632e+07
Polynesia                    9.184017e+04
South America                3.157992e+07
Southeast Asia               5.336937e+07
Southern Africa              1.480777e+07
Southern Europe              1.332821e+07
Southern and Central Asia    1.392465e+08
Western Africa               2.101353e+

## Geographic and Demographic Insights

In [14]:
# Find the country with the highest population density (population per square kilometer)
# Create a population_density column in the data frame.
# A zero (or negative) area would turn the division into +/-inf, and an inf
# density would always win idxmax below. Such areas are therefore masked to
# NaN first, which leaves the density of those rows as NaN.
population_df['population_density'] = (
    population_df['population']
    / population_df['area'].where(population_df['area'] > 0)
)

# Get the country and corresponding maximum population density values.
# The grouped sum is used as a per-country lookup (assumes one row per country -
# TODO confirm); NaN densities are skipped by sum() and so show up as 0 here.
population_density = population_df.groupby('country')['population_density'].sum()
country_with_highest_population_density = population_density.idxmax()
highest_population_density = round(population_density.max(), 2)

print(f"{country_with_highest_population_density} has the highest population density of {highest_population_density}")


Monaco has the highest population density of 18292.08


In [15]:
# Determine the region with the most and least number of countries
# (count() tallies the non-null country values in each region)
country_count_by_region = (
    population_df
    .groupby('region')['country']
    .count()
)

# Region holding the fewest countries, and how many it has
region_with_least_num_of_countries = country_count_by_region.idxmin()
least_num_of_countries_count = country_count_by_region[region_with_least_num_of_countries]

# Region holding the most countries, and how many it has
region_with_most_num_of_countries = country_count_by_region.idxmax()
most_num_of_countries_count = country_count_by_region[region_with_most_num_of_countries]

print(
    f"{region_with_most_num_of_countries} has the most number of countries at {most_num_of_countries_count} "
    f"while {region_with_least_num_of_countries} only has {least_num_of_countries_count}."
)

Eastern Africa has the most number of countries at 17 while Antarctica only has 1.


# Daily Weather DataFrame

## Weather Trends

In [16]:
# Read the daily weather table from its parquet file and preview the first five rows
weather_df = pd.read_parquet('./weatherData/daily_weather.parquet')
weather_df.head(n=5)

Unnamed: 0,station_id,city_name,date,season,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh,peak_wind_gust_kmh,avg_sea_level_pres_hpa,sunshine_total_min
0,41515,Asadabad,1957-07-01,Summer,27.0,21.1,35.6,0.0,,,,,,
1,41515,Asadabad,1957-07-02,Summer,22.8,18.9,32.2,0.0,,,,,,
2,41515,Asadabad,1957-07-03,Summer,24.3,16.7,35.6,1.0,,,,,,
3,41515,Asadabad,1957-07-04,Summer,26.6,16.1,37.8,4.1,,,,,,
4,41515,Asadabad,1957-07-05,Summer,30.8,20.0,41.7,0.0,,,,,,


In [17]:
# Print the column names, dtypes and memory usage of the daily weather table
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27635763 entries, 0 to 24220
Data columns (total 14 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   station_id              category      
 1   city_name               category      
 2   date                    datetime64[ns]
 3   season                  category      
 4   avg_temp_c              float64       
 5   min_temp_c              float64       
 6   max_temp_c              float64       
 7   precipitation_mm        float64       
 8   snow_depth_mm           float64       
 9   avg_wind_dir_deg        float64       
 10  avg_wind_speed_kmh      float64       
 11  peak_wind_gust_kmh      float64       
 12  avg_sea_level_pres_hpa  float64       
 13  sunshine_total_min      float64       
dtypes: category(3), datetime64[ns](1), float64(10)
memory usage: 2.6 GB


In [18]:
# Count the missing values in each column of the daily weather table
weather_df.isna().sum()

station_id                       0
city_name                    13993
date                             0
season                           0
avg_temp_c                 6230907
min_temp_c                 5718229
max_temp_c                 5539346
precipitation_mm           6642500
snow_depth_mm             24208615
avg_wind_dir_deg          24183195
avg_wind_speed_kmh        22350295
peak_wind_gust_kmh        26514277
avg_sea_level_pres_hpa    23618606
sunshine_total_min        26614302
dtype: int64

In [19]:
# What is the average temperature, precipitation, and wind speed for a specific city?

In [20]:
# Identify the hottest and coldest days based on max_temp_c and min_temp_c