In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the raw happiness data; the path is resolved relative to the working directory.
df = pd.read_csv("happiness_score.csv")

In [None]:
# Show the first rows to sanity-check the load.
df.head()

In [None]:
# List the column labels available for analysis.
df.columns

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

# Exploratory Data Analysis (EDA)

In [None]:
import seaborn as sns


In [None]:
# Happiness score against GDP per capita.
sns.scatterplot(data=df, x="Happiness Score", y="Economy (GDP per Capita)")

In [None]:
# Happiness score against life expectancy.
sns.scatterplot(data=df, x="Happiness Score", y="Health (Life Expectancy)")

In [None]:
# Happiness score against the freedom measure.
sns.scatterplot(data=df, x="Happiness Score", y="Freedom")

In [None]:
# Happiness score against trust in government.
sns.scatterplot(data=df, x="Happiness Score", y="Trust (Government Corruption)")

In [None]:
# Grid of pairwise relationships between the columns of df.
sns.pairplot(df)

# Correlation

In [None]:
# Pairwise correlation matrix. Restrict to numeric columns first: df still
# carries non-numeric label columns, and DataFrame.corr() raises on those
# from pandas 2.0 onwards instead of silently skipping them.
df.select_dtypes(include="number").corr()

In [None]:
# Correlation of every numeric column with the target, weakest to strongest.
# Non-numeric columns are excluded up front so corr() also works on pandas >= 2.0.
df.select_dtypes(include="number").corr()['Happiness Score'].sort_values()

In [None]:
import matplotlib.pyplot as plt

# Annotated heatmap of the correlation matrix. Only numeric columns are passed
# to corr(); pandas >= 2.0 raises if text columns are included.
plt.figure(figsize=(15, 7))
sns.heatmap(
    df.select_dtypes(include="number").corr(),
    annot=True,
    linewidths=0.5,
    linecolor='black',
    fmt='.2f',
)

# Descriptive Statistics

In [None]:
# Summary statistics for the columns of df.
df.describe()

In [None]:
# Heatmap of the summary statistics, one row per variable. The first describe()
# row ('count') is sliced off before transposing.
plt.figure(figsize=(15, 12))
summary = df.describe()[1:].transpose().round(2)
# The values are rounded to two decimals, so annotate with two decimals as well
# (fmt="f" printed six), and use heatmap's documented `linewidths` argument.
sns.heatmap(summary, linewidths=2, annot=True, fmt=".2f")
plt.xticks(fontsize=18)
plt.yticks(fontsize=12)
plt.title("Variables summary")
plt.show()


In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
import warnings
# Silences every warning for the rest of the session, including deprecation
# notices raised by the plotting and modelling libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Box plot of every column after the first two, to eyeball outliers.
# NOTE(review): assumes the first two columns are the non-numeric labels — confirm against df.columns.
collist = df.columns.values[2:]

# Size the grid from the number of columns actually plotted instead of a fixed
# 14x30 grid on a 30x90 inch canvas, which left each plot a thin sliver.
ncol = 4
nrows = max(1, -(-len(collist) // ncol))  # ceiling division
plt.figure(figsize=(4 * ncol, 4 * nrows))
for i, column in enumerate(collist):
    plt.subplot(nrows, ncol, i + 1)
    sns.boxplot(data=df[column], color='green', orient='v')
    plt.title(column)
# Lay the figure out once, after all subplots exist, rather than on every iteration.
plt.tight_layout()

In [None]:
# Skewness of each numeric column. Non-numeric columns are excluded explicitly
# because DataFrame.skew() raises on them from pandas 2.0 onwards.
df.select_dtypes(include="number").skew()

In [None]:
# Distribution of the 'Standard Error' column.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot —
# check the installed seaborn version before migrating.
sns.distplot(df['Standard Error'])

In [None]:
# Distribution of the 'Health (Life Expectancy)' column.
sns.distplot(df['Health (Life Expectancy)'])

In [None]:
# Distribution of the 'Trust (Government Corruption)' column.
sns.distplot(df['Trust (Government Corruption)'])

In [None]:
# Distribution of the 'Generosity' column.
sns.distplot(df['Generosity'])

In [None]:
# Correlation of each numeric column with the target. Text columns are filtered
# out first so the call does not raise on pandas >= 2.0.
df.select_dtypes(include="number").corr()['Happiness Score']

In [None]:
# Working copy without the identifier and rank columns; df itself is not modified.
df1 = df.drop(columns=['Country', 'Region', 'Happiness Rank'])

# Z-score outlier detection

In [None]:
from scipy.stats import zscore
import numpy as np

# Absolute z-score of every value in df1.
raw_scores = zscore(df1)
z = np.abs(raw_scores)
z.shape

In [None]:
# Positions (row indices, column indices) of the cells whose |z| exceeds the
# cut-off. The comparison uses `threshold`, so changing the cut-off here takes
# effect; previously the literal 3 was repeated and the variable went unused.
threshold = 3
print(np.where(z > threshold))

In [None]:
# Number of individual cells with |z| > 3 (a row with several such cells is counted once per cell).
outlier_row_idx, outlier_col_idx = np.where(z > 3)
len(outlier_row_idx)

In [None]:
# Preview the frame with the outlier rows removed. DataFrame.drop returns a new
# frame and leaves df untouched, so the result has to be the displayed
# expression; previously it was discarded and the unfiltered df was shown.
# The rows are derived from z instead of a hand-copied index list.
outlier_rows = np.unique(np.where(z > 3)[0])  # row positions with at least one |z| > 3
df.drop(index=df.index[outlier_rows])

In [None]:
# Keep only the rows in which every z-score is below 3, then report how many rows were lost.
keep_mask = (z < 3).all(axis=1)
df_new = df[keep_mask]

print("old Dataframe", df.shape)
print("New Dataframe", df_new.shape)
print("total_dropped_rows", df.shape[0] - df_new.shape[0])

In [None]:
# Share of rows removed by the outlier filter, computed from the actual frame
# sizes rather than hand-typed counts, so it stays correct if the data or the
# z-score cut-off changes.
rows_before = df.shape[0]
rows_after = df_new.shape[0]
loss_percent = (rows_before - rows_after) / rows_before * 100
print(loss_percent, '%')

In [None]:
# Positional split of the outlier-free frame: column 3 is the target, columns 4 onward are the features.
# NOTE(review): presumably column 3 is 'Happiness Score' — confirm against df.columns.
y = df_new.iloc[:, 3]
x = df_new.iloc[:, 4:]

In [None]:
from sklearn.preprocessing import power_transform

# Apply a Yeo-Johnson power transform to the features. x is rebound to the
# transformed result, so re-running this cell alone transforms it a second time.
x = power_transform(x, method='yeo-johnson')

In [None]:
# Display the transformed feature matrix.
x

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise each feature column with a scaler fitted on x.
# NOTE(review): X is not read by any later visible cell; the models below are trained on x.
sc = StandardScaler()
sc.fit(x)
X = sc.transform(x)
X

# Testing Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Linear regression estimator used by the fitting cells that follow.
lm=LinearRegression()

# NOTE(review): confusion_matrix, RandomForestClassifier, ExtraTreesRegressor and
# GradientBoostingRegressor are not referenced in the visible cells.
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
# NOTE(review): clf is created here but not used in the visible cells.
clf = RandomForestRegressor(n_estimators=10)

In [None]:
# Baseline fit on an 80/20 split. The seed is a fixed literal: it previously
# read `i`, a loop counter left over from an earlier plotting cell, so the split
# silently depended on unrelated code and failed on a kernel where `i` was unset.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=43, test_size=0.20)
lm.fit(x_train, y_train)
pred_train = lm.predict(x_train)
pred_test = lm.predict(x_test)
print(pred_train)
print(pred_test)

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=43
)

In [None]:
# Dimensions of the training features.
x_train.shape

In [None]:
# Dimensions of the training target.
y_train.shape

In [None]:
# Dimensions of the held-out features.
x_test.shape

In [None]:
# Dimensions of the held-out target.
y_test.shape

In [None]:
# Start from a fresh, unfitted linear regression.
lm=LinearRegression()

In [None]:
# Fit the linear model on the training split.
lm.fit(x_train,y_train)

In [None]:
# Fitted coefficients, one per feature column of x_train.
lm.coef_

In [None]:
# Fitted intercept term.
lm.intercept_

In [None]:
# Score of the fitted model on the training split itself (not on held-out data).
lm.score(x_train,y_train)

In [None]:
import sklearn
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Predict on the held-out split here: `pred` was otherwise first assigned in a
# later cell, so this cell raised NameError on a fresh Restart & Run All.
pred = lm.predict(x_test)
print('Mean absolute error:', mean_absolute_error(y_test, pred))

In [None]:
# Root of the mean squared error between the held-out targets and the predictions.
rmse = np.sqrt(mean_squared_error(y_test, pred))
print('Root Mean Squared error:', rmse)

In [None]:
from sklearn.metrics import r2_score

# R-squared of the predictions against the held-out targets.
test_r2 = r2_score(y_test, pred)
print(test_r2)

In [None]:
# Predictions for the held-out rows, stored in a one-column frame whose index
# is the true target values (the second positional argument of DataFrame is `index`).
pred = lm.predict(x_test)
pf = pd.DataFrame(data=pred, index=y_test)
pf

In [None]:
# Write the prediction frame (index included) to a CSV in the working directory.
pf.to_csv('Happiness_pred_submission.csv')

# KFold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE

In [None]:
# 5-fold cross-validated R-squared of a plain linear regression on every feature column of x_train.
lm = LinearRegression()
scores = cross_val_score(
    lm,
    x_train,
    y_train,
    scoring='r2',
    cv=5,
)
scores

In [None]:

# Tune how many features RFE keeps, scoring each candidate count with shuffled 5-fold CV.
folds = KFold(n_splits=5, shuffle=True, random_state=43)

# Candidate counts run from 1 up to the number of columns actually present in
# x_train, instead of a hard-coded 1..13 that need not match this dataset.
n_features_total = x_train.shape[1]
hyper_params = [{'n_features_to_select': list(range(1, n_features_total + 1))}]

# RFE and GridSearchCV fit their own clones of the estimator, so it is handed
# over unfitted; fitting it beforehand had no effect on the search.
lm = LinearRegression()
rfe = RFE(lm)

model_cv = GridSearchCV(
    estimator=rfe,
    param_grid=hyper_params,
    scoring='r2',
    cv=folds,
    verbose=1,
    return_train_score=True,
)

model_cv.fit(x_train, y_train)

In [None]:
# Tabulate the grid-search results (one row per candidate setting) and preview them.
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head()

In [None]:
# Mean train/test R-squared as a function of the number of features RFE keeps.
plt.figure(figsize=(16, 6))

n_selected = cv_results["param_n_features_to_select"]
for score_column in ("mean_test_score", "mean_train_score"):
    plt.plot(n_selected, cv_results[score_column])

plt.xlabel('number of features')
plt.ylabel('r-squared')
plt.title("Optimal Number of Features")
plt.legend(['test score', 'train score'], loc='upper left')

In [None]:
# Refit RFE with the chosen number of features and score it on the held-out split.
# NOTE(review): confirm 10 against the CV curve and the number of columns in x_train.
n_features_optimal = 10

lm = LinearRegression()
rfe = RFE(lm, n_features_to_select=n_features_optimal)
rfe = rfe.fit(x_train, y_train)

# Predict happiness scores for x_test with the fitted RFE model. The prediction
# previously came from `lm` trained on all features, so the reported R-squared
# ignored the feature selection entirely.
y_pred = rfe.predict(x_test)
r2 = sklearn.metrics.r2_score(y_test, y_pred)
print(r2)