In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

In [2]:
# Read the training set from disk and preview its first five rows.
data = pd.read_csv('../Data/train.csv')
data.head(n=5)

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views,Upvotes
0,52664,a,3942.0,2.0,155623,7855.0,42.0
1,327662,a,26046.0,12.0,21781,55801.0,1175.0
2,468453,c,1358.0,4.0,56177,8067.0,60.0
3,96996,a,264.0,3.0,168793,27064.0,9.0
4,131465,c,4271.0,4.0,112223,13986.0,83.0


In [5]:
# Encode the categorical Tag column as integer codes. The fitted encoder is
# kept in `labelencoder_X` so later cells can reuse it.
labelencoder_X = LabelEncoder()
tag_codes = labelencoder_X.fit_transform(data['Tag'])
data['Tag'] = tag_codes

# ID and Username are removed so they are not used as model inputs.
data.drop(columns=['ID', 'Username'], inplace=True)

# Upvotes is the quantity the regression models predict.
target = data['Upvotes']

In [6]:
# Model inputs: every remaining column other than the Upvotes target.
features = [column for column in data.columns if column != 'Upvotes']

In [10]:
# Hold out 22% of the rows for validation; the fixed seed keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    data[features],
    target,
    test_size=0.22,
    random_state=205,
)

In [13]:
# Baseline: ordinary least squares on the raw features. fit() returns the
# estimator itself, so construction and fitting can be chained.
regression_model = LinearRegression().fit(x_train, y_train)
regression_model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [21]:
# Pair each training column with its fitted weight and report it.
for col_name, coefficient in zip(x_train.columns, regression_model.coef_):
    print("The coefficient for {} is {}".format(col_name, coefficient))

The coefficient for Tag is 7.285068247482572
The coefficient for Reputation is 0.03516475746306239
The coefficient for Answers is -37.66716536336018
The coefficient for Views is 0.01966756801540111


In [23]:
# Report the fitted intercept of the baseline linear model.
intercept = regression_model.intercept_
intercept_message = "The intercept for our model is {}".format(intercept)
print(intercept_message)

The intercept for our model is -397.4616793521459


In [24]:
# In-sample R^2 of the baseline model (the same quantity .score() reports).
r2_score(y_train, regression_model.predict(x_train))

0.2456687829647335

**Adding interaction terms to check whether they improve the model's performance**

In [25]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

# Degree-2, interaction-only expansion: adds pairwise products of the input
# columns (plus a bias column) without pure squared terms.
poly = PolynomialFeatures(degree=2, interaction_only=True)

# Fit the transformer on the training split only, then reuse that fitted
# transformer for the validation split instead of refitting it there.
X_train2 = poly.fit_transform(x_train)
X_test2 = poly.transform(x_val)

poly_clf = linear_model.LinearRegression()
poly_clf.fit(X_train2, y_train)

# Validation-set predictions from the interaction model.
y_pred = poly_clf.predict(X_test2)

# In-sample (training) R^2 never decreases as variables are added to a
# least-squares fit, so this number alone does not show generalization.
print(poly_clf.score(X_train2, y_train))

0.9025591221428022


In [27]:
# Held-out R^2 of the interaction model on the validation split.
print(r2_score(y_val, poly_clf.predict(X_test2)))

0.9160368790349769


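Adding pairwise interaction features (degree-2, interaction-only polynomial terms) significantly improved model performance: training R² rose from about 0.25 to about 0.90, and the validation R² is about 0.92.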

In [51]:
# Read the test set and keep its ID column aside for the submission file.
testdata = pd.read_csv('../Data/test.csv')
ids = testdata['ID']

In [52]:
# Reuse the encoder fitted on the training tags so each tag maps to the same
# integer code the model was trained on; refitting on the test set could
# assign different codes. transform() raises ValueError for a tag that was
# not present when the encoder was fitted.
testdata['Tag'] = labelencoder_X.transform(testdata['Tag'])

# Remove the same non-feature columns that were removed from the training frame.
testdata.drop(columns=['ID', 'Username'], inplace=True)

In [53]:
# Apply the already-fitted interaction transformer; test data is only
# transformed, never used to refit a preprocessing step.
X_test = poly.transform(testdata[features])

In [54]:
# Predict upvotes for the test set with the interaction model.
y_pred_test = poly_clf.predict(X=X_test)

In [55]:
# The linear model can output negative values, but an upvote count cannot be
# negative. Clip those predictions to 0 (the nearest valid value) rather than
# taking abs(), which would turn e.g. -300 into a prediction of 300 upvotes.
pred_test = np.clip(y_pred_test, 0, None)

# One row per test ID, written without the DataFrame index.
submission = pd.DataFrame({'ID': ids, 'Upvotes': pred_test})
submission.to_csv("final_sub.csv", index=False)