In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../input/facebook-metrics/dataset_Facebook.csv', sep=';')
df.head()

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
0,139441,Photo,2,12,4,3,0.0,2752,5091,178,109,159,3078,1640,119,4,79.0,17.0,100
1,139441,Status,2,12,3,10,0.0,10460,19057,1457,1361,1674,11710,6112,1108,5,130.0,29.0,164
2,139441,Photo,3,12,3,3,0.0,2413,4373,177,113,154,2812,1503,132,0,66.0,14.0,80
3,139441,Photo,2,12,2,10,1.0,50128,87991,2211,790,1119,61027,32048,1386,58,1572.0,147.0,1777
4,139441,Photo,2,12,2,3,0.0,7244,13594,671,410,580,6228,3200,396,19,325.0,49.0,393


7 features known prior to post publication and 12 features for evaluating post impact so 7 features will be taken for the model implementation  
Since the main goal of this model to predict only Total Intercations, I will drop comment,like and share columns as well

In [3]:
df.drop(df.iloc[:, 7:18], inplace = True, axis = 1)
df

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Total Interactions
0,139441,Photo,2,12,4,3,0.0,100
1,139441,Status,2,12,3,10,0.0,164
2,139441,Photo,3,12,3,3,0.0,80
3,139441,Photo,2,12,2,10,1.0,1777
4,139441,Photo,2,12,2,3,0.0,393
...,...,...,...,...,...,...,...,...
495,85093,Photo,3,1,7,2,0.0,84
496,81370,Photo,2,1,5,8,0.0,75
497,81370,Photo,1,1,5,2,0.0,115
498,81370,Photo,3,1,4,11,0.0,136


There are one Nan value in Paid cloumn and I will fill it with 0

In [4]:
df['Paid'] = df['Paid'].fillna(0)
df.isnull().any()

Page total likes      False
Type                  False
Category              False
Post Month            False
Post Weekday          False
Post Hour             False
Paid                  False
Total Interactions    False
dtype: bool

#### Binarizing categorical variables

Since Type column is categorical I will binarize them

In [5]:
# function to covert week days to integers from 1 to 7
def covert_weekdays(x):
    if x == 1:
        return 'Sunday'
    elif x== 2:
        return 'Monday'
    elif x == 3:
        return 'Tuesday'
    elif x == 4:
        return 'Wendesday'
    elif x == 5:
        return 'Thursday'
    elif x ==6:
        return 'Friday'
    elif x == 7:
        return "Saturay"
    
df['Weekday'] = df['Post Weekday'].apply(lambda x: covert_weekdays(x))

In [6]:
dayDf = pd.get_dummies(df['Weekday'])
dayDf

Unnamed: 0,Friday,Monday,Saturay,Sunday,Thursday,Tuesday,Wendesday
0,0,0,0,0,0,0,1
1,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0
3,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...
495,0,0,1,0,0,0,0
496,0,0,0,0,1,0,0
497,0,0,0,0,1,0,0
498,0,0,0,0,0,0,1


In [7]:
df = pd.concat([df,dayDf],axis=1)

In [8]:
hours = list(range(0,18))
for i in hours:
    hours[i] = str(hours[i])
    hours[i]='Hour_'+ hours[i]

In [9]:
hourDf = pd.get_dummies(df['Post Hour'],prefix='hr_')
df = pd.concat([df,hourDf],axis=1)
monthDf = pd.get_dummies(df['Post Month'],prefix='Mo')
df = pd.concat([df,monthDf],axis=1)
df['Video'] = pd.get_dummies(df['Type'])['Video']
df['Status'] = pd.get_dummies(df['Type'])['Status']
df['Photo'] = pd.get_dummies(df['Type'])['Photo']
df['Category_1'] = pd.get_dummies(df['Category'])[1]
df['Category_2'] = pd.get_dummies(df['Category'])[2]
df['Category_3'] = pd.get_dummies(df['Category'])[3]
df

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Total Interactions,Weekday,Friday,...,Mo_9,Mo_10,Mo_11,Mo_12,Video,Status,Photo,Category_1,Category_2,Category_3
0,139441,Photo,2,12,4,3,0.0,100,Wendesday,0,...,0,0,0,1,0,0,1,0,1,0
1,139441,Status,2,12,3,10,0.0,164,Tuesday,0,...,0,0,0,1,0,1,0,0,1,0
2,139441,Photo,3,12,3,3,0.0,80,Tuesday,0,...,0,0,0,1,0,0,1,0,0,1
3,139441,Photo,2,12,2,10,1.0,1777,Monday,0,...,0,0,0,1,0,0,1,0,1,0
4,139441,Photo,2,12,2,3,0.0,393,Monday,0,...,0,0,0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,85093,Photo,3,1,7,2,0.0,84,Saturay,0,...,0,0,0,0,0,0,1,0,0,1
496,81370,Photo,2,1,5,8,0.0,75,Thursday,0,...,0,0,0,0,0,0,1,0,1,0
497,81370,Photo,1,1,5,2,0.0,115,Thursday,0,...,0,0,0,0,0,0,1,1,0,0
498,81370,Photo,3,1,4,11,0.0,136,Wendesday,0,...,0,0,0,0,0,0,1,0,0,1


In [10]:
# Drop Type,Post Month,Post Weekday and Post hour columns from the dataframe
df.drop(['Category','Type','Post Month','Post Hour','Post Weekday','Weekday'], axis=1, inplace=True)
df

Unnamed: 0,Page total likes,Paid,Total Interactions,Friday,Monday,Saturay,Sunday,Thursday,Tuesday,Wendesday,...,Mo_9,Mo_10,Mo_11,Mo_12,Video,Status,Photo,Category_1,Category_2,Category_3
0,139441,0.0,100,0,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,1,0
1,139441,0.0,164,0,0,0,0,0,1,0,...,0,0,0,1,0,1,0,0,1,0
2,139441,0.0,80,0,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,1
3,139441,1.0,1777,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
4,139441,0.0,393,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,85093,0.0,84,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
496,81370,0.0,75,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
497,81370,0.0,115,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,1,0,0
498,81370,0.0,136,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1


#### Outlier removal

I will select 90th percentile as the boundary

In [11]:
# getting the edge value of total interactions
outlier = np.percentile(df['Total Interactions'],90)
outlier

409.1

In [12]:
df = df[df['Total Interactions']<outlier]

In [13]:
scaler = StandardScaler()
likes = df['Page total likes']
likes =likes.values.reshape(-1,1)
scaler.fit(likes)
df['Page total likes'] = scaler.transform(likes)
df

Unnamed: 0,Page total likes,Paid,Total Interactions,Friday,Monday,Saturay,Sunday,Thursday,Tuesday,Wendesday,...,Mo_9,Mo_10,Mo_11,Mo_12,Video,Status,Photo,Category_1,Category_2,Category_3
0,1.013546,0.0,100,0,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,1,0
1,1.013546,0.0,164,0,0,0,0,0,1,0,...,0,0,0,1,0,1,0,0,1,0
2,1.013546,0.0,80,0,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,1
4,1.013546,0.0,393,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
5,1.013546,0.0,186,0,0,0,1,0,0,0,...,0,0,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,-2.288209,0.0,84,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
496,-2.514389,0.0,75,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
497,-2.514389,0.0,115,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,1,0,0
498,-2.514389,0.0,136,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1


##### Data splitting into Train and Test

I will use 80% for training and 20% for testing

In [14]:
df.columns

Index(['Page total likes', 'Paid', 'Total Interactions', 'Friday', 'Monday',
       'Saturay', 'Sunday', 'Thursday', 'Tuesday', 'Wendesday', 'hr__1',
       'hr__2', 'hr__3', 'hr__4', 'hr__5', 'hr__6', 'hr__7', 'hr__8', 'hr__9',
       'hr__10', 'hr__11', 'hr__12', 'hr__13', 'hr__14', 'hr__15', 'hr__16',
       'hr__17', 'hr__18', 'hr__19', 'hr__20', 'hr__22', 'hr__23', 'Mo_1',
       'Mo_2', 'Mo_3', 'Mo_4', 'Mo_5', 'Mo_6', 'Mo_7', 'Mo_8', 'Mo_9', 'Mo_10',
       'Mo_11', 'Mo_12', 'Video', 'Status', 'Photo', 'Category_1',
       'Category_2', 'Category_3'],
      dtype='object')

In [15]:
x = df.drop(['Total Interactions'],axis=1).values
y = df['Total Interactions'].values

In [16]:
x_train,x_test,y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=50)

#### Linear regression model

In [17]:
linear_model = linear_model.LinearRegression()
linear_model.fit(x_train,y_train)

LinearRegression()

#### Predicting the values

In [18]:
test_pred = linear_model.predict(x_test)
train_pred = linear_model.predict(x_train)

In [19]:
test_score = r2_score(y_pred=test_pred,y_true=y_test)
train_score = r2_score(y_pred=train_pred,y_true=y_train)

print("R2 Score for the test",test_score)
print("R2 Score for the test",train_score)


R2 Score for the test -0.04600489945708097
R2 Score for the test 0.2534190647826525


R2 score of the testing values are very low since I will not preceed to check other accuracy metrics. 

#### Elasticnet Regression

In [20]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

#### Finding Best alpha
To figure out what alpha value is good for the model, Define multiple values and check the performance accuracy.

In [21]:
alphas = [0.00001,0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1]

In [22]:
for a in alphas:
    model = ElasticNet(alpha=a).fit(x,y)   
    score = model.score(x, y)
    pred_y = model.predict(x)
    mse = mean_squared_error(y, pred_y)   
    print("Alpha:{0:.4f}, R2:{1:.2f}, MSE:{2:.2f}, RMSE:{3:.2f}"
       .format(a, score, mse, np.sqrt(mse)))

Alpha:0.0000, R2:0.24, MSE:6319.42, RMSE:79.49
Alpha:0.0010, R2:0.24, MSE:6352.60, RMSE:79.70
Alpha:0.0100, R2:0.22, MSE:6478.01, RMSE:80.49
Alpha:0.1000, R2:0.19, MSE:6792.30, RMSE:82.42
Alpha:0.3000, R2:0.15, MSE:7108.63, RMSE:84.31
Alpha:0.5000, R2:0.12, MSE:7308.82, RMSE:85.49
Alpha:0.7000, R2:0.11, MSE:7453.96, RMSE:86.34
Alpha:1.0000, R2:0.09, MSE:7612.50, RMSE:87.25


Since 0.00001 Alpha value gives the highest R2 value and lowest RMSE value I will choose 0.00001 as the alpha

In [23]:
# appltying the elasticnet model
model = ElasticNet(alpha=0.001).fit(x_train, y_train)

# getting the prediciton using the test data
y_pred = model.predict(x_test)

# getting the r2 value 
test_score = r2_score(test_pred,y_test)

# getting mse value
mse = mean_squared_error(y_test, y_pred)

#printing the evaluations
print("R2:{0:.3f}, MSE:{1:.2f}, RMSE:{2:.2f}"
      .format(score, mse, np.sqrt(mse)))

R2:0.088, MSE:6981.22, RMSE:83.55


**Elasticnet regression also gives poor performance with very low R2 values**

Since both models performe poorly I will use `lazypredict` library to find out whether there are any suitable regression model.

Since kaggle has some environment issues, `lazypredict` cannot be insalled. Instead I have use `colab`.

Link to the colab notebook [https://colab.research.google.com/drive/1X40KhEpmfFnKZ6jQZeClVygMqvzyHP39?usp=sharing](http://)

Output from the `lazypredict`

|Model|Adjusted R-Squared|R-Squared|RMSE|Time Taken|
|---|---|---|---|---|
|LGBMRegressor|-0\.9054203107636525|0\.1436313210051|75\.69876729419335|0\.04472231864929199|
|HistGradientBoostingRegressor|-0\.944846451077221|0\.1259117073810243|76\.47792067583984|2\.002009630203247|
|OrthogonalMatchingPursuitCV|-0\.9892691998525984|0\.10594642703254009|77\.34641543725061|0\.0550999641418457|
|ElasticNetCV|-1\.0433264308818218|0\.08165104230030484|78\.39029315801712|0\.18924832344055176|
|TweedieRegressor|-1\.0455125272511059|0\.08066852707815464|78\.43221571173859|0\.017499923706054688|
|GeneralizedLinearRegressor|-1\.0455125272511059|0\.08066852707815464|78\.43221571173859|0\.07887840270996094|
|BayesianRidge|-1\.0512975591075597|0\.0780685127606473|78\.54304673972142|0\.03608226776123047|
|GradientBoostingRegressor|-1\.0571989125407177|0\.07541621908282359|78\.65594537511086|0\.1712944507598877|
|LarsCV|-1\.0592350358939964|0\.07450110746337246|78\.6948608143903|0\.1414165496826172|
|LassoLarsCV|-1\.0592350358939964|0\.07450110746337246|78\.6948608143903|0\.21893906593322754|
|LassoCV|-1\.0662602124860667|0\.07134372472536332|78\.82898211979632|0\.7464988231658936|
|ElasticNet|-1\.0787034862783895|0\.06575124212207206|79\.0659847659957|0\.034683942794799805|
|LassoLarsIC|-1\.080950350206861|0\.0647414156373659|79\.10870430987792|0\.07171750068664551|
|RandomForestRegressor|-1\.1090926322470396|0\.05209319899009468|79\.64183162630422|0\.4792003631591797|
|OrthogonalMatchingPursuit|-1\.1257787276668245|0\.0445938302621014|79\.95625430889052|0\.0174102783203125|
|LassoLars|-1\.1354819967646854|0\.04023281044283811|80\.13852956535352|0\.019057750701904297|
|NuSVR|-1\.2091506901320535|0\.007123285333908602|81\.50909715821058|0\.04314875602722168|
|SVR|-1\.2141920697901236|0\.004857496723539967|81\.60204784280369|0\.03908586502075195|
|Lasso|-1\.2398980173133332|-0\.0066957381183518905|82\.07436567321173|0\.019980907440185547|
|DummyRegressor|-1\.2680245152803158|-0\.019336860800141853|82\.58806378237104|0\.035079002380371094|
|RidgeCV|-1\.3391042507030377|-0\.05128280930473594|83\.87223214037938|0\.018326520919799805|
|Ridge|-1\.3506950238889872|-0\.05649214556808402|84\.0797779770072|0\.013875246047973633|
|SGDRegressor|-1\.3887783548926946|-0\.07360824938997501|84\.75812481188892|0\.026975393295288086|
|Lars|-1\.392229762659932|-0\.07515944389210416|84\.81933376819629|0\.08117961883544922|
|BaggingRegressor|-1\.4122205268396844|-0\.08414405700659966|85\.17299454639364|0\.0678553581237793|
|AdaBoostRegressor|-1\.426518551354715|-0\.09057013544032122|85\.42504577790923|0\.07617306709289551|
|HuberRegressor|-1\.4819944478233018|-0\.1155031226172143|86\.39603566176267|0\.06706976890563965|
|PoissonRegressor|-1\.5846709160299866|-0\.16164984990111742|88\.16496515493365|0\.07880735397338867|
|KNeighborsRegressor|-1\.703371403280065|-0\.21499838349665845|90\.16671965494437|0\.06117820739746094|
|XGBRegressor|-1\.7250993988277616|-0\.22476377475405007|90\.52834611041212|0\.20466399192810059|
|PassiveAggressiveRegressor|-1\.7264522942698073|-0\.22537181764935155|90\.55081505038989|0\.04274606704711914|
|LinearSVR|-1\.7887722661578724|-0\.25338079377881906|91\.57985206071567|0\.033075571060180664|
|MLPRegressor|-2\.1040476540842867|-0\.39507759734125236|96\.61790187354006|1\.0745947360992432|
|ExtraTreeRegressor|-2\.2432549154090804|-0\.45764265861082265|98\.76065343377729|0\.055632591247558594|
|ExtraTreesRegressor|-2\.488805802956522|-0\.5680026080703469|102\.43109378314335|0\.44019579887390137|
|DecisionTreeRegressor|-3\.0725741100776878|-0\.8303703865517698|110\.66949042582202|0\.040213584899902344|
|GaussianProcessRegressor|-5\.75294639373141|-2\.0350320870702965|142\.5082684906778|0\.08386015892028809|
|KernelRidge|-7\.158950923467616|-2\.666944235266344|156\.64284871767074|0\.07264161109924316|
|RANSACRegressor|-6\.025562679894729e+23|-2\.7081180583796532e+23|42568879270502\.2|0\.4028961658477783|
|LinearRegression|-1\.0738820118514468e+25|-4\.826436008321109e+24|179709773203379\.84|0\.0637514591217041|
|TransformedTargetRegressor|-1\.0738820118514468e+25|-4\.826436008321109e+24|179709773203379\.84|0\.01807999610900879|

According to the lazypredict all the models perform poorly. LGM regressor perform well relatively.  
Since there is a significant different between R2 and Adjusted R2 values, some variables do not actually increase to the model fit. 

#### LGBM Regressor

I will implement LGBM model since it can perform better than other models

Importing the library

In [24]:
import lightgbm as lgb

Model Implementation

In [25]:
gbm = lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)
gbm.fit(x_train, y_train,
        eval_set=[(x_test, y_test)],
        eval_metric='l1',
        callbacks=[lgb.early_stopping(5)])


Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[20]	valid_0's l1: 63.04	valid_0's l2: 5794.33


LGBMRegressor(learning_rate=0.05, n_estimators=20)

Predicting the values

In [26]:
y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration_)

rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print('The RMSE of prediction is' , rmse_test)
R2_score = r2_score(y_test,y_pred)
print('The R2 value of prediction is' , R2_score)

The RMSE of prediction is 76.12051906255309
The R2 value of prediction is 0.1340623107923048


#### Future Work
We can check the feature importance and check whether the model performs better with less features. 