# **Importing the needed Packages**

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
%matplotlib inline

# **Reading the File**

In [None]:
# Load the modelling dataset; the trailing expression previews the first rows.
df = pd.read_csv(filepath_or_buffer='model_1.csv')
df.head(n=5)

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Column dtypes, to spot the non-numeric columns that need encoding.
df.dtypes

In [None]:
# Convert the categorical columns to integer codes.
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on a column selection: that chained in-place
# form does not update `df` under pandas copy-on-write.
PRODUCT_CATEGORY_CODES = {"business": 1, "cleaning": 2, "entertainment": 3, "other": 4, "sport": 5, "tech": 6, "travel": 7}
DAY_CODES = {"monday": 1, "tuesday": 2, "wednesday": 3, "thursday": 4, "friday": 5, "saturday": 6, "sunday": 7}

df['product_category'] = df['product_category'].replace(PRODUCT_CATEGORY_CODES)
df['day'] = df['day'].replace(DAY_CODES)

# **Visualizing the Data**

**Visualizing the data with likes less than 10000**

In [None]:
# Subset of rows with strictly fewer than 10000 likes.
df1 = df.loc[df['likes'].lt(10000)]

In [None]:
#Using the pairplot to understand the relationship of target variables with various variables
%matplotlib inline
import seaborn as sns
sns.pairplot(df1, y_vars = 'likes')

In [None]:
# Distribution of 'likes' in the low-likes subset (density histogram + KDE).
# `sns.distplot` is deprecated; `histplot` with these options is the
# replacement seaborn documents for it.
sns.histplot(df1['likes'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Correlation heatmap for the low-likes subset (coefficients rounded to 2 decimals).
plt.figure(figsize=(30, 30))
sns.heatmap(
    df1.corr().round(2),
    annot=True,
    annot_kws={"size": 12},
    linewidths=.5,
    cmap='coolwarm',
)
plt.show()

**Visualizing the data with likes greater than 10000**

In [None]:
# Subset of rows with strictly more than 10000 likes.
# NOTE(review): the low-likes subset uses `< 10000`, so rows with exactly
# 10000 likes belong to neither subset - confirm this is intended.
df2 = df.loc[df['likes'].gt(10000)]

In [None]:
#Using the pairplot to understand the relationship of target variables with various variables
%matplotlib inline
import seaborn as sns
sns.pairplot(df2, y_vars = 'likes')

In [None]:
# Distribution of 'likes' in the high-likes subset (density histogram + KDE).
# `sns.distplot` is deprecated; `histplot` with these options is the
# replacement seaborn documents for it.
sns.histplot(df2['likes'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Correlation heatmap for the high-likes subset (coefficients rounded to 2 decimals).
plt.figure(figsize=(30, 30))
sns.heatmap(
    df2.corr().round(2),
    annot=True,
    annot_kws={"size": 12},
    linewidths=.5,
    cmap='coolwarm',
)
plt.show()

In [None]:
# Histograms of the 14 candidate predictor variables.
New_df = df[[
    "num_hrefs",
    "num_imgs",
    "num_keywords",
    " self_reference_min_shares",
    " self_reference_max_shares",
    " self_reference_avg_sharess",
    "day",
    "topic_quality",
    "topic_description",
    "topic_others",
    " n_non_stop_unique_tokens",
    "topic_shipping",
    "topic_packaging",
    "product_category",
]]
New_df.hist(figsize=(10, 10))

In [None]:
# Log-transform the skewed columns on a COPY of `df`.
# `df3 = df` would only alias the frame, so the helper columns below would be
# appended to `df` itself and shift any later positional column selection.
df3 = df.copy()
import math  # kept: later cells call math.log and rely on this import
df3['likes2'] = np.log1p(df3['likes'])
df3['num_hrefs2'] = np.log1p(df3['num_hrefs'])
df3['nnum_imgs2'] = np.log1p(df3['num_imgs'])  # column name kept as in the original

# Correlation of the 14 predictors with the (untransformed) target.
df4 = df3[["num_hrefs", "num_imgs", "num_keywords", " self_reference_min_shares", " self_reference_max_shares", " self_reference_avg_sharess", "day", "topic_quality", "topic_description", "topic_others", " n_non_stop_unique_tokens", "topic_shipping", "topic_packaging", "product_category", "likes"]]
plt.figure(figsize=(30, 30))
sns.heatmap(data=df4.corr().round(2), cmap='coolwarm', linewidths=.5, annot=True, annot_kws={"size": 12})
plt.show()

In [None]:
# Build the feature matrix / target and hold out 30% of the rows for testing.
from sklearn.model_selection import train_test_split

df5 = df

# The 14 selected independent variables.
X = df5[[
    "num_hrefs",
    "num_imgs",
    "num_keywords",
    " self_reference_min_shares",
    " self_reference_max_shares",
    " self_reference_avg_sharess",
    "day",
    "topic_quality",
    "topic_description",
    "topic_others",
    " n_non_stop_unique_tokens",
    "topic_shipping",
    "topic_packaging",
    "product_category",
]]

# Target on the log(likes + 1) scale.
Y = df5['likes'].map(lambda likes: math.log(likes + 1))

X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42, test_size=0.3)
X_train.head(10)

# **Models For Regression**

In [None]:
from sklearn.model_selection import GridSearchCV
import math

def gs_regression(model, par, cv=3, scoring='neg_mean_absolute_error'):
    """Grid-search `model` over `par` and print train/test regression metrics.

    The search is fitted on the notebook-level `X_train` / `y_train`; the
    refit best estimator is then evaluated on `X_train` and `X_test`.
    MAE, MSE, RMSE and R^2 are printed for the targets as given. The value in
    parentheses after each MAE is the MAE after mapping both targets and
    predictions through ``exp(v) - 1`` (back to the original scale).

    Parameters
    ----------
    model : estimator passed to ``GridSearchCV``.
    par : dict, parameter grid passed to ``GridSearchCV``.
    cv : int, number of cross-validation folds (default 3).
    scoring : str, scoring passed to ``GridSearchCV``
        (default ``'neg_mean_absolute_error'``).

    Returns
    -------
    None. Results are printed only.
    """
    from sklearn import metrics

    gs = GridSearchCV(model, par, cv=cv, scoring=scoring)
    gs.fit(X_train, y_train)

    # Summary of the grid search.
    print('***GRIDSEARCH RESULTS***')
    print("Best score: %f using %s" % (gs.best_score_, gs.best_params_))

    y_pred_train = gs.predict(X_train)
    y_pred_test = gs.predict(X_test)

    # Back-transform targets and predictions to the original scale.
    y_train_exp = np.expm1(y_train)
    y_test_exp = np.expm1(y_test)
    y_pred_train_exp = np.expm1(y_pred_train)
    y_pred_test_exp = np.expm1(y_pred_test)

    mse_train = metrics.mean_squared_error(y_train, y_pred_train)
    mse_test = metrics.mean_squared_error(y_test, y_pred_test)

    print()
    print("MAE  train %.3f (%f)  test %.3f (%f)" % (
        metrics.mean_absolute_error(y_train, y_pred_train),
        metrics.mean_absolute_error(y_train_exp, y_pred_train_exp),
        metrics.mean_absolute_error(y_test, y_pred_test),
        metrics.mean_absolute_error(y_test_exp, y_pred_test_exp),
    ))
    print("MSE  train %.3f              test %.3f" % (mse_train, mse_test))
    print("RMSE train %.3f              test %.3f" % (np.sqrt(mse_train), np.sqrt(mse_test)))
    print("r2   train %.3f              test %.3f" % (metrics.r2_score(y_train, y_pred_train), metrics.r2_score(y_test, y_pred_test)))

In [None]:
# Linear regression baseline: nothing to tune, so the grid is empty.
from sklearn.linear_model import LinearRegression

parameters = {}
regressor = LinearRegression()

gs_regression(model=regressor, par=parameters)

In [None]:
#Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# The `normalize` option was removed from scikit-learn's linear models, so a
# grid over it fails on current versions. The scaled / unscaled comparison is
# expressed as a pipeline step that is either a StandardScaler or skipped.
# Standardisation is not numerically identical to the old normalize=True, so
# the best alpha may differ from earlier runs.
regressor = Pipeline([("scaler", StandardScaler()), ("model", Ridge())])
parameters = {"scaler": [StandardScaler(), "passthrough"],
              "model__alpha": [0.001, 0.01, 0.1, 1, 10]}

gs_regression(regressor, parameters)

In [None]:
#Lasso Regression
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# The `normalize` option was removed from scikit-learn's linear models, so a
# grid over it fails on current versions. The scaled / unscaled comparison is
# expressed as a pipeline step that is either a StandardScaler or skipped.
# Standardisation is not numerically identical to the old normalize=True, so
# the best alpha may differ from earlier runs.
regressor = Pipeline([("scaler", StandardScaler()), ("model", Lasso())])
parameters = {"scaler": [StandardScaler(), "passthrough"],
              "model__alpha": [0.001, 0.01, 0.1, 1, 10]}

gs_regression(regressor, parameters)

In [None]:
# k-nearest-neighbours regressor: search k in {20, 30, 40} and the
# Minkowski power p in {1, 2}.
from sklearn.neighbors import KNeighborsRegressor

regressor = KNeighborsRegressor()
parameters = dict(n_neighbors=np.arange(20, 50, 10), p=[1, 2])

gs_regression(model=regressor, par=parameters)

In [None]:
#Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

# Fixed seed so repeated runs select the same tree; 42 matches the seed used
# for the train/test split and the random forest.
regressor = DecisionTreeRegressor(random_state=42)
parameters = {"max_depth": np.arange(1, 5),
              "min_samples_leaf": np.arange(1, 5)}

gs_regression(regressor, parameters)


In [None]:
#RandomForest Regressor
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor()
# - "criterion" is left at its default (squared error). The old 'mse' spelling
#   is rejected by current scikit-learn, and the default behaves the same on
#   every version.
# - min_samples_leaf: floats are fractions of the training set and must be
#   strictly below 1, so the former 1.0 was invalid; the integer 1 (one sample
#   per leaf) is used instead.
parameters = {"n_estimators": [10, 50, 100, 200],
              "min_samples_leaf": [0.07, 1, 0.3], "random_state": [42]}

gs_regression(regressor, parameters)


**Model Selection**

In [None]:
#We have selected the KNN Regressor as predictive model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

x = df[["num_hrefs", "num_imgs", "num_keywords", " self_reference_min_shares", " self_reference_max_shares", " self_reference_avg_sharess", "day", "topic_quality", "topic_description", "topic_others", " n_non_stop_unique_tokens", "topic_shipping", "topic_packaging", "product_category"]]

# Select the target by name. A positional `iloc[:, -1]` silently picks a
# different column as soon as any column has been appended to `df`.
Y = df['likes']
# Model the target on the log(likes + 1) scale, as in the model comparison.
y = np.log1p(Y)

regressor = KNeighborsRegressor()

parameters = {'n_neighbors': np.arange(20, 50, 10),
              'p': [1, 2]
            }

# Same CV setup and scoring as the model-comparison stage.
KNN = GridSearchCV(regressor, parameters, cv=3, scoring='neg_mean_absolute_error')

# Fit on the log-scale target: predictions are later converted back with
# exp(pred) - 1, which is only meaningful for a model trained on `y`.
KNN.fit(x, y)

# In-sample residuals on the log scale.
y_pred = KNN.predict(x)
error = y - y_pred


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Residuals against predicted values, with a zero reference line.
residual_ax = plt.gca()
residual_ax.scatter(y_pred, error, c="b", label="training data")
residual_ax.set_xlabel("Predicted Values")
residual_ax.set_ylabel("Residuals")
residual_ax.legend(loc="upper left")
residual_ax.hlines(y=0, xmin=0, xmax=8, color="r")
residual_ax.set_xlim([0, 8])
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Flat float copy of the residuals, used for the distribution fit.
nb_error = np.array(error, dtype=float).flatten()

# Standardised residuals (zero mean, unit variance).
# `error` is deliberately left untouched: scaling with copy=False and
# rebinding `error` overwrote the residuals, so re-running the cell gave
# different results.
scaled_error = StandardScaler().fit_transform(nb_error.reshape(-1, 1)).flatten()

In [None]:
import numpy as np
import scipy
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot_2samples
from matplotlib import pyplot as plt
import seaborn as sns

import scipy.stats  # a bare `import scipy` does not guarantee the `stats` submodule is loaded

# Fit a normal distribution to the residuals; the last two fitted
# parameters are the location (mean) and scale (standard deviation).
dist = scipy.stats.norm
param = dist.fit(nb_error)

err_mean = param[-2]
err_std = param[-1]

# Density histogram of the residuals.
ax = sns.histplot(nb_error, stat='density')

# Fitted normal pdf evaluated over the visible x-range.
x0, x1 = ax.get_xlim()
x_pdf = np.linspace(x0, x1, 100)
y_pdf = dist.pdf(x_pdf, loc=err_mean, scale=err_std)

# Overlay the fitted pdf on the histogram (it was computed but never drawn).
ax.plot(x_pdf, y_pdf, color='r', lw=2, label='fitted normal pdf')
ax.legend()
plt.show()


In [None]:
import pickle

# Persist the fitted grid search; the context manager closes the file handle
# even if pickling fails.
with open('KNN_model.pkl', 'wb') as model_file:
    pickle.dump(KNN, model_file)

In [None]:
# Rows to score with the saved model.
df_p = pd.read_csv(filepath_or_buffer='predictions.csv')

In [None]:
# Apply the same integer encoding as for the training data.
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on a column selection: that chained in-place
# form does not update `df_p` under pandas copy-on-write.
PRODUCT_CATEGORY_CODES = {"business": 1, "cleaning": 2, "entertainment": 3, "other": 4, "sport": 5, "tech": 6, "travel": 7}
DAY_CODES = {"monday": 1, "tuesday": 2, "wednesday": 3, "thursday": 4, "friday": 5, "saturday": 6, "sunday": 7}

df_p['product_category'] = df_p['product_category'].replace(PRODUCT_CATEGORY_CODES)
df_p['day'] = df_p['day'].replace(DAY_CODES)

# Same 14 predictor columns, in the same order, as used to fit the model.
X2 = df_p[["num_hrefs", "num_imgs", "num_keywords", " self_reference_min_shares", " self_reference_max_shares", " self_reference_avg_sharess", "day", "topic_quality", "topic_description", "topic_others", " n_non_stop_unique_tokens", "topic_shipping", "topic_packaging", "product_category"]]


In [None]:
# load model
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook or another trusted source.
with open('KNN_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# The model could alternatively be refit on the entire dataset before predicting.

y2_predictions = loaded_model.predict(X2)

In [None]:
# Raw model output, before the exp(...) - 1 back-transform applied further below.
print(y2_predictions)

In [None]:
# Save the raw model output.
pd.DataFrame(data=y2_predictions).to_csv(path_or_buf="values.csv")

In [None]:

# Back-transform the model output with exp(v) - 1, print it and save it.
y2_pred = np.subtract(np.exp(y2_predictions), 1)
print(y2_pred)
pd.DataFrame(data=y2_pred).to_csv(path_or_buf="values2.csv")