In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as ex

%matplotlib inline
# Use seaborn's "darkgrid" style for the matplotlib figures in this notebook.
sns.set_style("darkgrid")

In [None]:
# Location of the CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/daily-temperature-of-major-cities/city_temperature.csv"

# Read the whole file into a single DataFrame; the cells below all work on `df`.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to see which columns the file contains.
df.head(n=5)

### Looking At The Variables

> This dataset provides the average temperature for different regions, countries, states and cities. The data is also in time-series format, so there are multiple ways to look at it. I'll try to divide my notebook into three parts: univariate, bivariate and multivariate data visualizations.

> This notebook won't provide answers to specific questions; instead I'll try to create visualizations that help build intuition about the data.

> Also, I'm assuming the unit of measure is Celsius. (Correct me if I'm wrong 😅)

**Data Types**

In [None]:
# Column dtypes as inferred by pandas when the CSV was read (no dtypes were specified).
df.dtypes

**Null Values**

In [None]:
# Number of missing (NaN) entries in each column.
df.isnull().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns;
# used here to spot implausible values such as out-of-range years or days.
df.describe()

> Looks like there's a small error in the Year column: the minimum value is 200. It may be 2000, so replacing it may not be the best idea, but it'll help the visualization process a bit.

In [None]:
# List every distinct value of the Year column so malformed years stand out.
unique_years = df["Year"].unique()
print(f"Unique Years : {unique_years}")

> So there are two anomalies in the Year column: 200 and 201, which may be the years 2000 and 2010. So I'll replace them.

In [None]:
# Rewrite the two truncated year values in place. The targets are the author's
# guess at what was meant (200 -> 2000, 201 -> 2010); all other years are untouched.
for wrong_year, fixed_year in ((200, 2000), (201, 2010)):
    mask = df["Year"].eq(wrong_year)
    df.loc[mask, "Year"] = fixed_year

Another thing is Day, which has a minimum value of 0. I think it's some sort of mistake, so I'll replace it with 1.

In [None]:
# Distinct values of the Day column, to check for anything outside 1-31.
df["Day"].unique()

In [None]:
# Day 0 is not a valid calendar day; move those rows to the first of the month.
mask = df["Day"].eq(0)
df.loc[mask, "Day"] = 1

#### Univariate Exploration

> Starting with univariate exploration, I'll try to create visualizations that will help us gain an overview of the data.

In [None]:
# Headline numbers: how many distinct places and years the data covers,
# and the raw range of the temperature column.
n_regions = df["Region"].nunique()
n_countries = df["Country"].nunique()
n_cities = df["City"].nunique()
print(f"Total Regions : {n_regions}")
print(f"Total Countries : {n_countries}")
print(f"Total Cities : {n_cities}")

n_years = df["Year"].nunique()
first_year, last_year = df["Year"].min(), df["Year"].max()
print(f"\nWe have data of total {n_years} years starting from {first_year} to {last_year}.")

# NOTE(review): the unit printed below is the notebook's assumption, not something
# read from the data — confirm against the dataset documentation.
coldest, hottest = df["AvgTemperature"].min(), df["AvgTemperature"].max()
print(f"The temperature ranges from {coldest} ᵒC to {hottest} ᵒC")

### Counting

In [None]:
# Number of rows contributed by each region, most frequent first.
regions = df.Region.value_counts()

plt.figure(figsize=(12,4))
# x/y must be passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors positionally and raises a TypeError. Counts on x give horizontal bars.
sns.barplot(x=regions.values, y=regions.index, color="#3498db")
plt.title("Data Amount From Various Regions");

> Most of the data comes from North America, and I assume it'll be from the USA. And we have way less data for Australia / South Pacific.

In [None]:
# Number of rows contributed by each country, most frequent first.
countries = df.Country.value_counts()

# Tall figure: one horizontal bar per country.
plt.figure(figsize=(12,36))
# x/y must be passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors positionally and raises a TypeError.
sns.barplot(x=countries.values, y=countries.index, color="#3498db")
plt.title("Data Amount From Various Countries");

> As per my assumption, the bias in the data is mostly because of the US. Also, we have close to no data for Serbia-Montenegro.

> Now, there are so many cities that instead of plotting a bar for every single city I'll plot only the top 25 and bottom 25 cities from the data.

In [None]:
# Record count per city, largest first.
cities = df.City.value_counts().sort_values(ascending=False)
fig,axes = plt.subplots(2,1,figsize=(14,18))

# Top panel: the 25 cities with the most rows; bottom panel: the 25 with the fewest.
panels = (
    (axes[0], cities.head(25), "25 Cities With The Most Records"),
    (axes[1], cities.tail(25), "25 Cities With The Fewest Records"),
)
for ax, counts, title in panels:
    # x/y must be passed by keyword: seaborn >= 0.12 no longer accepts the data
    # vectors positionally and raises a TypeError.
    sns.barplot(x=counts.index, y=counts.values, color="#3498db", ax=ax)
    # Slant the city names so neighbouring labels do not overlap.
    ax.set_xticklabels(ax.get_xticklabels(),rotation=60)
    ax.set_title(title)

> The city with the most records is Washington and the one with the fewest is Bonn. Now that we have an idea about regions, countries and cities, let's look at the time data.

In [None]:
# Number of rows recorded in each year.
years = df['Year'].value_counts()

plt.figure(figsize=(14,6))
# x/y must be passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors positionally and raises a TypeError.
sns.barplot(x=years.index, y=years.values, color="#3498db")
plt.title("Number Of Records For Every Year")
plt.xticks(rotation=45);

> We have pretty much the same amount of data for every year except 2020, so there's no bias here.

In [None]:
# Number of rows recorded in each calendar month.
months = df.Month.value_counts()

plt.figure(figsize=(14,6))
# x/y must be passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors positionally and raises a TypeError.
sns.barplot(x=months.index, y=months.values, color="#3498db")
plt.title("Number Of Records For Every Month");

In [None]:
# Number of rows recorded on each day of the month.
day = df.Day.value_counts()

plt.figure(figsize=(14,6))
# x/y must be passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors positionally and raises a TypeError.
sns.barplot(x=day.index, y=day.values, color="#3498db")
plt.title("Number Of Records For Every Day");

> I can say that there's no bias in the time data, but making assumptions about the whole world based on it would still be bad, since most of the data is from the US and other North American countries.

> Let's see Temperature distribution.

In [None]:
plt.figure(figsize=(14,6))
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# draws the equivalent figure. bins=50 mirrors the upper limit distplot applied
# to its automatic bin count.
sns.histplot(x=df.AvgTemperature, bins=50, stat="density", kde=True)
plt.title("Temperature Distribution");

> Looking at the distribution, it's almost normal except for -100 $^{\circ}$C, since we have data from the South Pacific. So if we want to perform analysis based on some questions, they should be specific to a region, country or maybe a city, since the data is polarized across different places.

### Bivariate Exploration

> In this section I'll try to find relationships between different variables, starting with temperature over the years.

In [None]:
# Keep only the two columns needed for the per-year temperature statistics,
# then bucket the rows by year.
temp = df.loc[:, ['Year','AvgTemperature']]
group = temp.groupby(by="Year")

In [None]:
# Per-year mean, maximum and minimum of AvgTemperature (each indexed by Year).
mean_temp = group.mean()
max_temp = group.max()
min_temp = group.min()

# One line chart per statistic. x/y are passed by keyword because
# seaborn >= 0.12 no longer accepts the data vectors positionally.
for yearly, title in (
    (mean_temp, "Average Temperature For Every Year"),
    (max_temp, "Maximum Temperature For Every Year"),
    (min_temp, "Minimum Temperature For Every Year"),
):
    plt.figure(figsize=(14,5))
    sns.lineplot(x=yearly.index, y=yearly.AvgTemperature, color="#2ecc71")
    # Force one tick per year instead of matplotlib's automatic spacing.
    plt.xticks(yearly.index,rotation=90)
    plt.title(title)

> The average temperature is increasing with every year, but it decreases slightly after 2017, in both 2018 and 2019. And given the whole COVID-19 situation, I think it will gradually decrease this year as well.

> The maximum temperature for every year shows no significant difference between years, except for 2001, which had a small drop compared to other years.

> The minimum temperature for every year hasn't changed at all; it's been -99 $^{\circ}$C for years. Maybe analyzing it for a certain region, country or city would be a better option.

> Let's plot the same for the months and see how the temperature changes during the year.

In [None]:
# Keep only the two columns needed for the per-month temperature statistics,
# then bucket the rows by calendar month.
temp = df.loc[:, ['Month','AvgTemperature']]
group = temp.groupby(by="Month")

In [None]:
# Per-month mean, maximum and minimum of AvgTemperature (each indexed by Month).
mean_temp = group.mean()
max_temp = group.max()
min_temp = group.min()

# One line chart per statistic. x/y are passed by keyword because
# seaborn >= 0.12 no longer accepts the data vectors positionally.
for monthly, title in (
    (mean_temp, "Average Temperature For Every Month"),
    (max_temp, "Maximum Temperature For Every Month"),
    (min_temp, "Minimum Temperature For Every Month"),
):
    plt.figure(figsize=(14,5))
    sns.lineplot(x=monthly.index, y=monthly.AvgTemperature, color="#2ecc71")
    # Force one tick per month instead of matplotlib's automatic spacing.
    plt.xticks(monthly.index,rotation=90)
    plt.title(title)

> As expected, the average temperature for the months throughout the year is a Gaussian curve.

> Also, looking at the maximum temperature for a different insight won't make any sense, since it's also a Gaussian curve throughout the year. And nothing changes for the minimum temperatures.

> I don't think plotting the same graphs for day will give us any proper insight, so let's move on to regions. Let's find out the minimum, maximum and average temperatures for different regions.

In [None]:
# Columns needed for the per-region statistics; Country and City are kept so the
# hottest/coldest readings can be labelled with the place they came from.
temp = df[['Region','Country','City','AvgTemperature']]
# Group by the column name itself rather than a one-element list: with a list
# key, pandas 2.x warns when get_group() is called with the plain region name
# (it expects a 1-tuple there in future versions).
group = temp.groupby('Region')

In [None]:
# Mean temperature per region. Only AvgTemperature is aggregated: `group` also
# carries the text columns Country and City, which cannot be averaged
# (pandas >= 2.0 raises a TypeError instead of silently dropping them).
mean_temp = group['AvgTemperature'].mean()
plt.figure(figsize=(14,5))
# x/y are passed by keyword because seaborn >= 0.12 no longer accepts the data
# vectors positionally.
sns.barplot(x=mean_temp.index, y=mean_temp.values, color="#2ecc71")
plt.xticks(rotation=60)
plt.title("Average Temperature For Every Region");


def _plot_region_extremes(row_labels, title, label_y):
    """Draw one bar per region for an extreme reading, labelled with its place.

    Parameters
    ----------
    row_labels : pandas.Series
        Index labels of `temp`, one per region, pointing at the row to plot
        (the result of ``idxmax()`` / ``idxmin()`` on the grouped temperature).
    title : str
        Title of the figure.
    label_y : float
        Height (in axis units) at which the "City, Country" text is written.

    Returns
    -------
    pandas.DataFrame
        The plotted rows with columns Region, Country, City, Temp.
    """
    extremes = temp.loc[list(row_labels), ['Region','Country','City','AvgTemperature']]
    extremes.columns = ['Region','Country','City','Temp']
    # A fresh 0..n-1 index makes each row's index equal to its bar position.
    extremes = extremes.reset_index(drop=True)

    plt.figure(figsize=(14,5))
    ax = sns.barplot(x=extremes.Region, y=extremes.Temp, color="#2ecc71")
    plt.xticks(rotation=60)
    plt.title(title)

    for position, row in extremes.iterrows():
        ax.text(position, label_y, f"{row.City}, {row.Country}", color='#333', ha="center", rotation=90)
    return extremes


# idxmax/idxmin return the first row holding each region's extreme value, which
# is the same row the previous "filter on == max, take the first match" picked.
# The label heights (35 and -65) are hand-picked so the text sits inside the bars.
t = _plot_region_extremes(group['AvgTemperature'].idxmax(), "Maximum Temperature For Every Region", 35)

# NOTE(review): the notebook's own commentary reports a constant minimum of -99,
# which looks like a missing-data placeholder rather than a real reading — confirm.
t = _plot_region_extremes(group['AvgTemperature'].idxmin(), "Minimum Temperature For Every Region", -65)
    

> The average temperature for every region is quite similar, with Europe being the lowest. I also put the name of the country and city with both the minimum and maximum temperature, so it could give a clearer idea about that region.

> Plotting the same for country or city would be too messy, so for this notebook I'll move ahead.

### Multivariate Exploration.


I'm mostly a deep learning guy and I'm just trying my hand at data analytics, so any sort of comment would be helpful 😄.