In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import geopandas as gpd
from shapely.geometry import Point, Polygon
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
from IPython.core.display import HTML
import gc
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
# Enable plotly's offline notebook mode for the figures created further down.
init_notebook_mode(connected=True)
import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# which also hides pandas warnings about in-place edits on sliced frames.
warnings.filterwarnings("ignore")

# Reading Data

In [None]:
# Load the raw city-temperature CSV from the Kaggle input directory and preview it.
df = pd.read_csv("../input/daily-temperature-of-major-cities/city_temperature.csv")
df.head()

The data has 4 object columns, 3 integer columns and 1 float column.

In [None]:
# Column names, dtypes and non-null counts of the raw table.
df.info()

In [None]:
df.shape  # (number of rows, number of columns)

# Examine the Data

In [None]:
# Distinct values of the Country column.
df.Country.unique()

We will focus on the US. We select its rows and store them in a new dataframe.

In [None]:
# Keep only the US rows. Take an explicit copy so that the later edits to
# us_df (dropping rows/columns, adding Time_steps) act on an independent frame
# instead of on a slice of df.
us_df = df[df["Country"] == "US"].copy()
us_df.head()

In [None]:
# (rows, columns) of the US-only subset.
us_df.shape

When dealing with categorical data, we should examine the values each variable takes. The value_counts() method shows every distinct value together with the number of rows that contain it.

In [None]:
# Number of rows for each Region value within the US subset.
us_df.Region.value_counts()

When a feature has many distinct values, the output of value_counts() is truncated and does not show all of them. To avoid that, we can use the unique() method, which lets us see every distinct value of a categorical feature.

In [None]:
# Show every distinct US city, then how many distinct cities there are.
us_cities = us_df["City"].unique()
print(us_cities)
print(len(us_cities))

In [None]:
# Number of null entries in each column.
us_df.isnull().sum()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
us_df.describe().T

The code below shows the rows where the average temperature takes its minimum value (-99). There are 79672 rows equal to -99. We will drop these rows, so we keep their indexes in a list.

In [None]:
# Flag the rows whose AvgTemperature is -99, remember their index labels for
# the drop performed later, and display the flagged rows.
is_missing_temp = us_df["AvgTemperature"] == -99
avgtemp_drop_list = us_df.loc[is_missing_temp].index
us_df.loc[is_missing_temp]

In the code below we drop the rows whose average temperature equals -99. We also drop the Region feature, because it has just one value for the US.

In [None]:
# Remove the -99 rows and the Region column.
# The result is rebound instead of mutated in place, and the rows are selected
# with a fresh mask rather than stored index labels, so the cell can be re-run
# safely: a second run finds no -99 rows, and errors="ignore" tolerates the
# already-missing Region column instead of raising KeyError.
us_df = us_df[us_df["AvgTemperature"] != -99].drop(columns=["Region"], errors="ignore")

In [None]:
# Pack Year/Month/Day into a single YYYYMMDD integer, render it as text and
# parse that text into a proper datetime column.
date_key = us_df["Year"] * 10000 + us_df["Month"] * 100 + us_df["Day"]
us_df["Time_steps"] = pd.to_datetime(date_key.astype(str), format="%Y%m%d")
us_df.head()

Because we selected only the US rows from the data set, us_df still carries the index labels of the original data set. We use the reset_index() function of the pandas library to renumber the rows from zero. We pass the ***inplace=True*** parameter to make the change permanent.

In [None]:
# Renumber the rows from 0. drop=True discards the old labels directly, so no
# temporary "index" column has to be created and then removed.
us_df = us_df.reset_index(drop=True)
us_df.head()

In [None]:
# Summary statistics again, now that the -99 rows have been removed.
us_df.describe().T

In [None]:
# Number of rows per state.
us_df["State"].value_counts()

# Visualization

## Visualization For World

Before starting the visualization, we copy the data set into a variable named df_world. Looking at the statistics of the data set, the lowest value of the average temperature is -99. These values mark missing data, so we will drop them.

In [None]:
# Work on an independent copy so the raw frame df stays untouched.
df_world = df.copy()
# Summary statistics, transposed so that each described column becomes a row.
df_world.describe().T

In [None]:
# Remove, in place, every row whose AvgTemperature is -99.
world_missing_rows = df_world.loc[df_world["AvgTemperature"] == -99].index
df_world.drop(index=world_missing_rows, inplace=True)

We need International Organization for Standardization (ISO) codes of countries for visualization. That's why we include this data set in the project.

In [None]:
# Country -> ISO code lookup: read the file, keep the two relevant columns,
# de-duplicate and renumber the rows.
iso_code = (
    pd.read_csv('../input/iso-codes/iso_codes.csv')
    .loc[:, ['Country', 'ISO_Code']]
    .drop_duplicates()
    .reset_index(drop=True)
)
iso_code.head()

In [None]:
# Yearly mean temperature per country, joined with the ISO codes the map needs.
dfc = (
    df_world.groupby(['Year', 'Country'])['AvgTemperature'].agg(['mean'])
    .reset_index()
    .rename(columns={'mean': 'AvgTemperature'})
    # NOTE(review): merge() defaults to an inner join, so any country whose
    # name has no exact match in iso_code silently disappears from the map.
    .merge(iso_code, left_on='Country', right_on='Country')
    .sort_values(by=['Year', 'Country'])
)
# Within each year, rank countries from hottest (1) and from coldest (1).
dfc['Rank_hottest'] = dfc.groupby(by=['Year'])['AvgTemperature'].rank(method="min", ascending=False)
dfc['Rank_coldest'] = dfc.groupby(by=['Year'])['AvgTemperature'].rank(method="min", ascending=True)

# Animated world map: one frame per year, coloured by the yearly mean.
fig = (
    px.choropleth(
        dfc,
        locations='ISO_Code',
        color='AvgTemperature',
        hover_name='Country',
        hover_data={'ISO_Code': False, 'Year': True, 'AvgTemperature': ':.2f'},
        animation_frame='Year',
        color_continuous_scale='Portland',
        height=600,
    )
    .update_layout(
        title_text='WORLD AVERAGE TEMPERATURE',
        title_x=0.3,
        margin=dict(r=10, t=40, b=10, l=10),
        # The values are plotted exactly as stored in the data set, and its
        # published description gives AvgTemperature in degrees Fahrenheit
        # (NOTE(review): confirm), so the colour bar must not say "°C".
        coloraxis_colorbar_title='Temp °F',
    )
)
# Set the per-frame duration used by the first animation button.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 800
fig.show()

## General Visualization About United States

The following code shows the average temperatures of the states in the United States in January 1995. The year 1995 is the earliest year in the data set for the US.

In [None]:
# Bar chart of AvgTemperature per state for January 1995.
is_january_1995 = (us_df["Year"] == 1995) & (us_df["Month"] == 1)
us_1995_1 = us_df[is_january_1995]
plt.figure(figsize=(22, 10))
plt.title("The Average Temperature of States in the United States in January 1995.")
sns.barplot(data=us_1995_1, x="State", y="AvgTemperature")
plt.xticks(rotation=90)
plt.show()

Now we draw the same graph for January 2020 and look at the difference. At first glance, there is a visible change in temperatures: the average temperatures of some states have decreased, while those of others have increased.

In [None]:
# Bar chart of AvgTemperature per state for January 2020.
is_january_2020 = (us_df["Year"] == 2020) & (us_df["Month"] == 1)
us_2020_1 = us_df[is_january_2020]
plt.figure(figsize=(22, 10))
plt.title("The Average Temperature of States in the United States in January 2020.")
sns.barplot(data=us_2020_1, x="State", y="AvgTemperature")
plt.xticks(rotation=90)
plt.show()

## Visualization About Alaska

In [None]:
# Alaska rows, plus a date-indexed view of them for time-series plotting.
alaska_df = us_df[us_df["State"] == "Alaska"]
alaska_df_time = alaska_df.set_index('Time_steps')
sns.set(rc={'figure.figsize':(11, 4)})
# The frame holds rows for more than one city (the bar chart of Alaska splits
# by City), so the same date occurs several times. Plotting the raw column
# draws one line that runs through the date range once per city. Averaging per
# date first gives a single state-level series, as the title promises.
alaska_df_time.groupby(level='Time_steps')['AvgTemperature'].mean().plot(linewidth=0.5);
plt.title("Average Temperature in Alaska by Years");

In [None]:
# Yearly AvgTemperature bars for Alaska, one bar colour per city.
plt.figure(figsize=(22, 10))
plt.title("Average Temperature of Alaska Cities by Years")
sns.barplot(data=alaska_df, x="Year", y="AvgTemperature", hue="City")
plt.xticks(rotation=90)
plt.show()

## Visualization About Texas

According to the United States Environmental Protection Agency, the climate in Texas is already changing due to anthropogenic climate change. As of 2016, most of the state had already warmed by 1.5 degrees since the previous century because of global warming. Texas is expected to have a wide range of environmental impacts, including rising sea levels, increased extreme weather and wildfires, and pressure on water resources.

In [None]:
# Texas rows, plus a date-indexed view of them for time-series plotting.
texas_df = us_df[us_df["State"] == "Texas"]
texas_df_time = texas_df.set_index('Time_steps')
sns.set(rc={'figure.figsize':(11, 4)})
# The frame holds rows for more than one city, so the same date occurs several
# times. Plotting the raw column draws one line that runs through the date
# range once per city. Averaging per date first gives a single state-level
# series, as the title promises.
texas_df_time.groupby(level='Time_steps')['AvgTemperature'].mean().plot(linewidth=0.5);
plt.title("Average Temperature in Texas by Years");

In [None]:
# Number of rows per Texas city.
texas_df["City"].value_counts()

In [None]:
# Summary statistics of the Texas subset, transposed so each described column becomes a row.
texas_df.describe().T

In [None]:
# Compare Texas cities across four reference years.
texas_compare_df = texas_df[texas_df["Year"].isin([1995, 2000, 2010, 2020])]
plt.figure(figsize=(22, 10))
plt.title("Average Temperature of Texas Cities by Years")
sns.barplot(data=texas_compare_df, x="Year", y="AvgTemperature", hue="City")
plt.xticks(rotation=90)
plt.show()

# ML Model For Texas

In [None]:
# Renumber the Texas rows from 0. drop=True discards the old labels directly,
# and rebinding avoids mutating a frame that was sliced out of us_df.
texas_df = texas_df.reset_index(drop=True)
texas_df.head()

In [None]:
# Drop the columns that are not used as model inputs.
# The result is rebound rather than mutated in place, and errors="ignore" lets
# the cell be re-run after the columns are already gone instead of raising
# KeyError.
texas_df = texas_df.drop(columns=["Country", "Day", "Time_steps", "State"], errors="ignore")
texas_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline

In [None]:
# Replace each city name with the integer code assigned by a fitted LabelEncoder.
le = LabelEncoder().fit(texas_df["City"])
texas_df["City"] = le.transform(texas_df["City"])
texas_df.head()

In [None]:
# Column dtypes and non-null counts of the modelling frame.
texas_df.info()

In [None]:
# Target: the temperature column. Features: every remaining column.
y = texas_df["AvgTemperature"]
X = texas_df.drop(columns=["AvgTemperature"])

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Decision Tree

In [None]:
# Fit a decision-tree regressor and report R² on the held-out rows.
# random_state is fixed so the fitted tree, and therefore the score, is the
# same on every run.
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
r2_score(y_test, y_pred)

## Random Forest 

In [None]:
# Fit a random-forest regressor and report R² on the held-out rows.
# random_state is fixed so the forest's random choices, and therefore the
# score, are the same on every run.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
r2_score(y_test, rf.predict(X_test))