<a href="https://www.kaggle.com/code/sreemontiduttabanik/global-warming?scriptVersionId=190784150" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
#from sklearn.tree import DecisionTreeRegressor
#from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import accuracy_score
# List the files available in the temperature dataset's input directory.
os.listdir('../input/climate-change-earth-surface-temperature-data')

In [None]:
# Load the global temperature CSV into the DataFrame used by the cells below.
GloTemp=pd.read_csv('../input/climate-change-earth-surface-temperature-data/GlobalTemperatures.csv')

In [None]:
# Preview the first rows of the raw temperature table.
print("FIRST FEW RECORDS:")
GloTemp.head()

In [None]:
# (rows, columns) of the raw table.
GloTemp.shape

In [None]:
# Column labels of the raw table.
GloTemp.columns

In [None]:
# Per-column dtype and non-null count summary.
GloTemp.info()

In [None]:
# Last rows of the raw table.
GloTemp.tail()

In [None]:
#Original data
# Summary statistics of the numeric columns before any cleaning is applied.
GloTemp.describe()

In [None]:
# Rows that repeat an earlier row in every column.
duplicate_rows=GloTemp[GloTemp.duplicated()]
# Report the row count itself; .shape would print a (rows, columns) tuple
# under a label that promises a single number.
print("No. of duplicate rows:",len(duplicate_rows))

In [None]:
# Keep only the first occurrence of each fully duplicated row, then show the new size.
GloTemp=GloTemp.drop_duplicates()
GloTemp.shape

In [None]:
#checking for null values
# Number of missing entries in each column.
GloTemp.isnull().sum()

In [None]:
#removing null values
# dropna() with default arguments discards every row that has a missing value
# in any column, so only fully populated records remain.
GloTemp=GloTemp.dropna()
print("After removing rows with null data:")
# Non-null count per column; all columns should now report the same number.
GloTemp.count()

In [None]:
# First rows of the cleaned table.
GloTemp.head()

In [None]:
#Data after cleaning
# Summary statistics once duplicates and incomplete rows have been removed.
GloTemp.describe()

In [None]:
#Convert the Celsius temperatures to Fahrenheit
def convert_to_farenheit(temp):
    """Return ``temp`` scaled by 1.8 and shifted by 32.

    Works on anything that supports ``*`` and ``+`` with numbers: a plain
    number, or a pandas Series when used through ``DataFrame.apply``.
    """
    return temp * 1.8 + 32
# Columns holding absolute temperatures, and their matching '...Uncertainty' columns.
temp_col=['LandAverageTemperature','LandMaxTemperature','LandMinTemperature','LandAndOceanAverageTemperature']
uncertainty_col=[name+'Uncertainty' for name in temp_col]
# Absolute readings get the full conversion (scale by 1.8, then add 32).
GloTemp[temp_col] = GloTemp[temp_col].apply(convert_to_farenheit)
# Uncertainties are temperature differences, so only the 1.8 scale applies and
# the +32 offset must not be added. Leaving them unconverted would make the
# later "average +/- uncertainty" bands combine values in two different units.
# NOTE(review): assumes the uncertainty columns share the unit of the temperature columns - confirm.
GloTemp[uncertainty_col] = GloTemp[uncertainty_col]*1.8
GloTemp.head()

In [None]:
#Convert 'dt' to datetime format
def convert_to_datetime(temp_df):
    """Return a copy of ``temp_df`` with ``'dt'`` parsed and date parts added.

    The copy's ``'dt'`` column is converted with ``pd.to_datetime`` and two
    columns are appended: ``'month'`` and ``'Year'``, taken from the parsed
    dates. The frame passed in is left untouched.
    """
    result = temp_df.copy()
    timestamps = pd.to_datetime(result['dt'])
    result['dt'] = timestamps
    result['month'] = timestamps.dt.month
    result['Year'] = timestamps.dt.year
    return result

# Derive the year of every record, discard the raw date and the month, and
# move 'Year' to the front as an ordinary column: set_index followed by
# reset_index leaves the frame with a fresh 0..n-1 index.
new_glo_temp = (
    convert_to_datetime(GloTemp)
    .drop(columns=['dt', 'month'])
    .set_index('Year')
    .reset_index()
)
new_glo_temp.head()

In [None]:
#Plotting the average land temperature with higher and lower bounds
def plot_avg_temp(df,col1,col2,label):
    """Plot the yearly mean of ``col1`` with a band of +/- the yearly mean of ``col2``.

    Parameters
    ----------
    df : DataFrame containing ``col1`` and ``col2``. The year of each row is
        read from a ``'Year'`` column when the frame has one, otherwise from
        the index.
    col1 : name of the temperature column.
    col2 : name of the uncertainty column added to / subtracted from ``col1``.
    label : figure title.

    Draws on a new matplotlib figure; returns None.
    """
    cols=[col1,col2]
    # Group on real years. Grouping on the index alone is wrong when the frame
    # carries a default 0..n-1 index: every row becomes its own group, nothing
    # is averaged, and the x axis shows row numbers instead of years.
    year_key=df['Year'] if 'Year' in df.columns else df.index
    average_temp_per_year=df[cols].groupby(year_key).mean()
    average_temp_per_year['Higher_temp']=average_temp_per_year[col1]+average_temp_per_year[col2]
    average_temp_per_year['Lower_temp']=average_temp_per_year[col1]-average_temp_per_year[col2]

    year_axis=average_temp_per_year.index
    upper=average_temp_per_year['Higher_temp']
    lower=average_temp_per_year['Lower_temp']

    plt.figure(figsize=(10,5))
    plt.plot(year_axis, average_temp_per_year[col1], color='maroon', label='Average Temp')
    # Band edges are drawn without a legend entry; the filled area carries the label.
    plt.plot(year_axis, upper, color='navy', alpha=0.7)
    plt.plot(year_axis, lower, color='navy', alpha=0.7)
    plt.fill_between(year_axis, upper, lower, color='skyblue', alpha=0.2, label='Temperature Uncertainty Boundaries')

    plt.xlabel("Year")
    plt.ylabel("Average Temperature")
    plt.title(label)
    plt.legend(loc='best')
    plt.grid()

#call the function
# Land-only series: average temperature with its uncertainty column as the band width.
plot_avg_temp(new_glo_temp,'LandAverageTemperature','LandAverageTemperatureUncertainty','Land Average Temperature Per Year')

In [None]:
#Plotting the average land and ocean temperature with higher and lower bounds
def plot_land_ocean_avg_temp(df,col1,col2,label):
    """Plot the yearly mean of ``col1`` with a +/- ``col2`` band.

    This function had the same body, statement for statement, as
    ``plot_avg_temp``; it now forwards to that function so the plotting logic
    lives in one place.

    Parameters
    ----------
    df : DataFrame containing ``col1`` and ``col2``.
    col1 : name of the temperature column.
    col2 : name of the uncertainty column.
    label : figure title.

    Draws on a new matplotlib figure; returns None.
    """
    plot_avg_temp(df,col1,col2,label)

#call the function
# Land-and-ocean series. Note that this calls plot_avg_temp, not the
# plot_land_ocean_avg_temp helper defined just above in this cell.
plot_avg_temp(new_glo_temp,'LandAndOceanAverageTemperature','LandAndOceanAverageTemperatureUncertainty','Land & Ocean Average Temperature Per Year')

In [None]:
# Compare the distribution of land & ocean temperature in selected years.
year_in_intervals = [1850, 1890, 1930, 1970, 2010]
# Read the year from the 'Year' column when the frame has one, otherwise from
# the index. Filtering on the index alone selects by row number when the frame
# carries a default 0..n-1 index, which picks single unrelated rows.
year_values = new_glo_temp['Year'] if 'Year' in new_glo_temp.columns else new_glo_temp.index
in_interval = year_values.isin(year_in_intervals)
temp_interval_df = new_glo_temp[in_interval]
plt.figure(figsize=(8,4))
sns.boxplot(x=year_values[in_interval], y=temp_interval_df['LandAndOceanAverageTemperature'], data =temp_interval_df, palette='Reds')
plt.xlabel("Years")
plt.ylabel("Average Temperature")
plt.title("Average Land & Ocean Temperature in 40 Year Intervals")

In [None]:
# List the files available in the greenhouse-gas dataset's input directory.
os.listdir('../input/annual-greenhouse-gas-emission-accounts/')

In [None]:
# Load the greenhouse-gas emission accounts CSV into the DataFrame used by the cells below.
ghg_df=pd.read_csv('../input/annual-greenhouse-gas-emission-accounts/Annual_Greenhouse_Gas_(GHG)_Air_Emissions_Accounts.csv')

In [None]:
# Quick look at the emissions table.
# NOTE(review): in a notebook only the final expression of a cell is rendered,
# so head() and tail() here presumably produce no visible output - confirm.
ghg_df.head()
ghg_df.tail()
ghg_df.columns

In [None]:
# Missing-value count per column.
# NOTE(review): the result is neither printed nor the last expression of the
# cell, so it is presumably not shown - confirm.
ghg_df.isnull().sum()
# Remove the listed columns; none of them is read by the later cells visible in this notebook.
ghg_df=ghg_df.drop(['ISO2','ISO3','Indicator','Unit','Source','CTS_Code','CTS_Name','CTS_Full_Descriptor','Seasonal_Adjustment','Scale'], axis=1)
ghg_df.info()

In [None]:
# Distinct values of the 'Country' column.
ghg_df.Country.unique()

In [None]:
# 'Country' labels to exclude; judging by the names these are regional or
# economic groupings rather than individual countries.
drop_rows=['Advanced Economies','Emerging and Developing Economies','G20','G7','Africa', 'Americas','Asia','World','Europe','Other Oceania sub-regions','Oceania']
# Drop the matching rows by index label, mutating ghg_df in place.
ghg_df.drop(ghg_df[ghg_df['Country'].isin(drop_rows)].index, inplace = True)
# Remaining distinct 'Country' values.
ghg_df.Country.unique()

In [None]:
ghg_df.head()
# NOTE(review): set_index returns a new frame and the result is not assigned,
# so ghg_df keeps its current index; this line only yields an indexed view for display.
ghg_df.set_index('ObjectId2')

In [None]:
# Distinct values of the 'Industry' column.
ghg_df.Industry.unique()
# NOTE(review): dropna returns a new frame and the result is not assigned, so
# rows with missing values remain in ghg_df for the cells below.
ghg_df.dropna()

In [None]:
# Number of rows recorded for each industry.
ghg_df['Industry'].value_counts()

In [None]:
#sources of emission

plt.figure()
# Take the per-industry row counts straight from the data instead of
# hard-coding them, so the chart cannot drift from ghg_df. sort_index orders
# the industries alphabetically, matching the order of the previous label list.
industry_counts=ghg_df['Industry'].value_counts().sort_index()
val=industry_counts.tolist()
var=industry_counts.index.tolist()
# NOTE(review): slice sizes are the number of rows per industry, not emission
# volumes - confirm this is what the title is meant to convey.
plt.pie(val,labels=var,autopct='%.2f')
plt.title("Sources Of Greenhouse Gas Emission")

In [None]:
#Average emission per gas type for each year 2010-2022
cols=[f'F{year}' for year in range(2010,2023)]
# Per-gas yearly means; this frame stays untouched for the later year-wise plot.
avg_emm_df_og=ghg_df.groupby(['Gas_Type'])[cols].mean()
# Working copy that additionally carries the mean across all listed years.
avg_emm_df=avg_emm_df_og.copy()
avg_emm_df["Avg_Emission"]=avg_emm_df[cols].mean(axis=1)
# Leave out the 'Greenhouse gas' row so only the other gas types remain.
is_total=avg_emm_df.index=='Greenhouse gas'
avg_emm_clean=avg_emm_df.drop(index=avg_emm_df.index[is_total])
avg_emm_clean.head()

In [None]:
#Gas emission
# One slice per gas type, sized by its 'Avg_Emission' value.
gas_labels=avg_emm_clean.index
avg_emm=avg_emm_clean["Avg_Emission"].tolist()
plt.figure()
plt.pie(avg_emm,labels=gas_labels)
plt.title("Emission of Various GreenHouse Gases")
plt.show()

In [None]:
#Year-wise level of all Greenhouse Gases
# x-axis labels: the yearly column names F2010 .. F2022.
years=[f'F{year}' for year in range(2010,2023)]
# Pick the 'Greenhouse gas' row of the untouched per-gas yearly means.
is_total=avg_emm_df_og.index=='Greenhouse gas'
row=avg_emm_df_og[is_total].iloc[0]
row_list=row.tolist()
plt.figure(figsize=(10,5))
plt.plot(years,row_list, color="blue")
plt.title("Year-wise level of Greenhouse Gases")

In [None]:
# Heatmap of the pairwise correlations between the columns of new_glo_temp.
sns.heatmap(new_glo_temp.corr())

In [None]:
#Train Linear Regression Model
# Each temperature series followed by its uncertainty, in table order.
cols=[
    series+suffix
    for series in ('LandAverageTemperature','LandMaxTemperature','LandMinTemperature','LandAndOceanAverageTemperature')
    for suffix in ('','Uncertainty')
]
# Collapse the records to one row per year; 'Year' becomes the index.
new_glo_temp=new_glo_temp.reset_index().groupby('Year')[cols].mean()
new_glo_temp.head()

# Features: the six land columns. Targets: the land & ocean average and its uncertainty.
X = new_glo_temp[cols[:-2]]
y = new_glo_temp[cols[-2:]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression().fit(X_train, y_train)

# Predictions for both partitions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Error and goodness-of-fit on both partitions
train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

for caption, score in (
    ("Training MSE:", train_mse),
    ("Testing MSE:", test_mse),
    ("Training R2:", train_r2),
    ("Testing R2:", test_r2),
):
    print(caption, score)

In [None]:
#Visualize the prediction against actual data
# 'Year' is the index of X_train (the feature columns hold temperatures only),
# so X_train['Year'] raises KeyError. The model also has two target columns,
# so one of them is selected explicitly for plotting.
target='LandAndOceanAverageTemperature'
target_pos=y_train.columns.get_loc(target)
train_years=X_train.index.to_numpy()
# train_test_split shuffles rows; sort by year so the line runs left to right.
order=np.argsort(train_years)
plt.scatter(train_years,y_train[target])
plt.plot(train_years[order],y_pred_train[order,target_pos], color="red")
plt.title("Actual vs Predicted data")

In [None]:
#Visualize the prediction against test data
# Plot the held-out partition; the previous version re-plotted the training
# data under the "Test" title. 'Year' is the index of X_test, not a column,
# and the model has two target columns, so one is selected explicitly.
target='LandAndOceanAverageTemperature'
target_pos=y_test.columns.get_loc(target)
test_years=X_test.index.to_numpy()
# train_test_split shuffles rows; sort by year so the line runs left to right.
order=np.argsort(test_years)
plt.scatter(test_years,y_test[target])
plt.plot(test_years[order],y_pred_test[order,target_pos], color="red")
plt.title("Test vs Predicted data")

In [None]:
# Transpose so each row is one column of avg_emm_clean (the yearly F-columns
# plus 'Avg_Emission') and each gas type is a column; the former row labels
# end up in a column named 'index'.
avg_emm_clean_trans=avg_emm_clean.transpose().reset_index()
avg_emm_clean_trans.head()

# NOTE(review): this cell previously contained bare `X=` and `y=`, which is a
# syntax error. The assignments below are a reviewer's assumption - regress
# each gas type's mean emission on the calendar year - confirm the intent.
# Keep only the yearly rows (labels like 'F2010'); the 'Avg_Emission' summary row is not a year.
is_year_row=avg_emm_clean_trans['index'].str.match(r'^F\d{4}$')
yearly_emm=avg_emm_clean_trans[is_year_row]
X=yearly_emm['index'].str[1:].astype(int).to_frame('Year')
y=yearly_emm.drop(columns='index')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate the model
train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

print("Training MSE:", train_mse)
print("Testing MSE:", test_mse)
print("Training R2:", train_r2)
print("Testing R2:", test_r2)