## Data Preprocessing and Feature Engineering
This is an important step in machine learning. Here we will perform feature scaling (either normalisation or standardisation), which puts the features on a comparable scale and helps scale-sensitive ML models make accurate predictions.

In [61]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import statsmodels.graphics.correlation as sgc
from statsmodels.graphics.gofplots import qqplot
import statsmodels.stats.api as sms
from statsmodels.stats.outliers_influence import OLSInfluence


ModuleNotFoundError: No module named 'statsmodels'

In [18]:
# Load the cleaned dataset produced by the earlier cleaning step.
df = pd.read_csv("Regression project cleaned dataset", index_col=None)

# The file was exported with its row index included, which comes back as a
# meaningless 'Unnamed: 0' column. Drop it so it cannot be mistaken for a
# feature later on. errors='ignore' keeps this cell working once the export
# is fixed (to_csv(..., index=False)) and the column no longer exists.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Output the top 5 rows as a quick sanity check.
df.head()

Unnamed: 0,Country,Year,Savanna_fires,Forest_fires,Crop_Residues,Rice_Cultivation,Drained_organic_soils_(CO2),Pesticides_Manufacturing,Food_Transport,Forestland,...,Manure_Management,Fires_in_organic_soils,Fires_in_humid_tropical_forests,On-farm_energy_use,Rural_population,Urban_population,Total_Population_-_Male,Total_Population_-_Female,total_emission,Average_Temperature_°C
0,Afghanistan,1993,14.7237,0.0557,230.8175,686.0,0.0,11.712073,54.3617,-2388.803,...,352.2947,0.0,0.0,140.6888,11858090.0,3237009.0,7003641.0,7000119.0,2368.470529,0.101917
1,Afghanistan,1994,14.7237,0.0557,242.0494,705.6,0.0,11.712073,53.9874,-2388.803,...,367.6784,0.0,0.0,140.6888,12690115.0,3482604.0,7733458.0,7722096.0,2500.768729,0.37225
2,Afghanistan,1995,14.7237,0.0557,243.8152,666.4,0.0,11.712073,54.6445,-2388.803,...,397.5498,0.0,0.0,140.6888,13401971.0,3697570.0,8219467.0,8199445.0,2624.612529,0.285583
3,Afghanistan,1996,38.9302,0.2014,249.0364,686.0,0.0,11.712073,53.1637,-2388.803,...,465.205,0.0,0.0,140.6888,13952791.0,3870093.0,8569175.0,8537421.0,2838.921329,0.036583
4,Afghanistan,1997,30.9378,0.1193,276.294,705.6,0.0,11.712073,52.039,-2388.803,...,511.5927,0.0,0.0,140.6888,14373573.0,4008032.0,8916862.0,8871958.0,3204.180115,0.415167


#### Perform Encoding for Categorical or Non-Numeric features
We might not need this

In [16]:
#dummy variable encoding for categorical variables
# Currently disabled: the call below is commented out, so no df_encoded is
# created by this cell. NOTE(review): the shape shown under this cell is
# presumably left over from a run where the line was enabled -- re-run to confirm.
#df_encoded = pd.get_dummies(df, drop_first = True)

(6270, 257)

### Feature scaling 
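I'm going to use standardisation, which centres each feature on its mean and rescales it to unit standard deviation, using `StandardScaler` from scikit-learn's `sklearn.preprocessing` module. Standardisation is preferred here because, unlike min-max normalisation, it does not squash every value into a fixed range set by the most extreme points, so it is less distorted by outliers (although the mean and standard deviation it uses are still influenced by them). We noticed that the dataset contains a number of outliers in various columns.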

In [57]:
#split data in predictor and response variables
# Columns that are never used as predictors: the target itself plus the
# Country and Year columns.
non_feature_cols = ['Average_Temperature_°C', 'Country', 'Year']

# A leftover 'Unnamed: N' column is only the row index written out by an
# earlier CSV export. It is a row counter, not a measurement, so it must not
# be fed to the models. The list is empty when no such column is present.
stale_index_cols = [col for col in df.columns if str(col).startswith('Unnamed:')]

X = df.drop(columns=non_feature_cols + stale_index_cols)

y = df['Average_Temperature_°C']

#Scale our X features
scaler = StandardScaler()

# Save standardized features into new variable
# NOTE(review): fitting on every row lets the rows that later become the test
# set contribute to the mean/std; for an unbiased evaluation the scaler should
# be fitted on the training rows only.
X_scaled = scaler.fit_transform(X)


#### Create Train Test Split
Here we're going to split our data into a training set and a testing set. We'll use an 80/20 split: 20% of the rows (features `X` and target `y`) are held out for testing the model, and the remaining 80% are used for training.

In [59]:
#create train test split
# Split the *unscaled* features so that the held-out rows stay completely
# unseen until evaluation. random_state and test_size are unchanged, so the
# same rows land in train and test as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

# Fit the scaler on the training rows only, then apply that same transform to
# the test rows. Fitting on all rows would leak test-set mean/std into
# training and make the evaluation optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


#### Select and develop three primary models
I'm going to make use of the following models:
1. Linear Regression
2. Ridge Regression
3. Random Forest Regressor

We can add more or change it up later if need be.

#### Instantiate regression model objects

In [35]:

# Instantiate Linear Regression Object (ordinary least squares, no regularisation)
lr_model = LinearRegression()

# Instantiate Ridge Regression Object (L2-regularised linear model; alpha sets the penalty strength)
ridge_model = Ridge(alpha = 0.1)

# Instantiate Random Forest Object: 50 trees, each limited to a depth of 5.
# random_state is fixed because the forest uses random bootstrap samples and
# feature subsets; without a seed the fitted model and its scores change on
# every run, so results could not be reproduced or compared fairly.
rf_model = RandomForestRegressor(n_estimators = 50, max_depth = 5, random_state = 42)

#### Fit model to training data and predict y (response variable) - Average_Temperature_°C 

In [37]:
#train linear model
lr_model.fit(X_train, y_train)

#train ridge model
ridge_model.fit(X_train, y_train)

#train random forest model
rf_model.fit(X_train, y_train)

#predict y(response variable)
# Each prediction must come from its own fitted model; previously all three
# lines called lr_model.predict, so the ridge and random forest "predictions"
# were just copies of the linear regression output.
y_pred_lr = lr_model.predict(X_test) #predict using linear model
y_pred_ridge = ridge_model.predict(X_test) #predict using ridge model
y_pred_rf = rf_model.predict(X_test) #predict using random forest model

#### Evaluate model performance