# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import requests
import random
from IPython.core.display import display, HTML
from sklearn.feature_selection import mutual_info_regression,SelectKBest,f_regression,VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from collections import OrderedDict

# Inject a CSS rule so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seed numpy's global RNG so the random draws below are repeatable.
np.random.seed(123)

# Reading Data

In [None]:
curr_dir = os.getcwd()

# Read every test CSV and stack them with a single concat call. Growing a
# frame inside the loop re-copies the accumulated rows on each iteration.
# The paths are sorted because glob returns them in arbitrary order, and the
# seeded train/test split further down depends on the row order built here.
testfiles = sorted(glob.glob(curr_dir+"/Testing/TestSet/*.csv"))
if not testfiles:
    # Fail with a clear message instead of the column-length mismatch that
    # renaming the columns of an empty frame would raise.
    raise FileNotFoundError("No test CSV files found in "+curr_dir+"/Testing/TestSet/")
test_df = pd.concat([pd.read_csv(testfile,header=None) for testfile in testfiles])

# Read the training CSV (no header row in the file).
train_df = pd.read_csv(curr_dir+"/Training/Features_Variant_1.csv",header=None)

# Construct the 54 column labels: 53 predictors followed by the target.
features = ["PageLikes","PageCheckins","PageTalkingAbout","PageCategory"]
features.extend(["derived_"+str(i) for i in range(1,26)])
features.extend(["CC1","CC2","CC3","CC4","CC5","BaseTime","PostLength","PostShareCount","PostPromotionStatus","Hhours"])
features.extend(["PostPublishedWeekday"+str(i) for i in range(1,8)])
features.extend(["BaseTimeWeekDay"+str(i) for i in range(1,8)])
features.extend(["Target"])

# Name the columns of both frames, then pool them into one frame with a fresh
# 0..n-1 index; the notebook does its own train/test split later.
train_df.columns = features
test_df.columns = features
data_df = pd.concat([train_df,test_df],ignore_index=True)



# Selecting 11 features from dataset using statistical analysis

In [None]:

# Predictors are every column but the last; the target is the last column,
# kept as a one-column DataFrame.
data_x = data_df.iloc[:, :-1]
data_y = data_df.iloc[:, -1:]

# Min-max scale the predictors and restore the column names that the scaler's
# array output drops.
scaler = MinMaxScaler()
data_x_scaled = pd.DataFrame(scaler.fit_transform(data_x), columns=data_x.columns.values)

# Keep the 41 predictors ranked best by f_regression against the target.
data_x_bestfit = SelectKBest(f_regression, k=41).fit(data_x_scaled, data_y)
selected_mask = data_x_bestfit.get_support()
data_x_filtered = data_x_scaled.iloc[:, selected_mask]

# For each column count how many columns (itself included) it correlates with
# above 0.5, and keep only the columns whose count is at most one.
correlated_counts = (data_x_filtered.corr() > 0.5).sum()
data_x_filtered = data_x_filtered.loc[:, correlated_counts <= 1].copy()

# Splitting data into train and test sets in a ratio of 70 : 30

In [None]:
def split_and_return_coefficients(data_x,data_y,frac,target_label,intercept_label):
  """Split features/target into train and test parts and name the coefficients.

  Parameters
  ----------
  data_x : pd.DataFrame
      Feature matrix; its index must also be present in ``data_y``.
  data_y : pd.DataFrame
      Frame holding the target column ``target_label``.
  frac : float
      Fraction of rows sampled into the training set. The remaining rows
      form the test set. (Previously ignored in favour of a hard-coded 0.7.)
  target_label : str
      Name of the target column inside ``data_y``.
  intercept_label : str
      Name of the constant-1 column inserted at position 0 of both matrices.

  Returns
  -------
  tuple
      ``(coefficients, train_data, test_data, train_target, test_target)``
      where ``coefficients`` is ``['b0', 'b<col1>', ...]``, the two data
      frames carry the intercept column first, and the targets are Series
      aligned with the corresponding data frame.
  """
  # Fixed random_state keeps the split reproducible between runs.
  train_data = data_x.sample(frac=frac,random_state=111)
  train_target = data_y.loc[train_data.index,target_label]

  # Every row that was not sampled for training goes to the test set.
  test_data = data_x.drop(train_data.index)
  test_target = data_y.loc[test_data.index,target_label]

  # One coefficient name per feature, preceded by the intercept 'b0'. The
  # names are built before the intercept column is inserted.
  coefficients = ['b0']
  coefficients.extend(['b'+str(column) for column in train_data.columns.values])

  # Leading column of ones so the intercept is learned like any other weight.
  train_data.insert(0,intercept_label,[1]*len(train_data.index))
  test_data.insert(0,intercept_label,[1]*len(test_data.index))

  return coefficients,train_data,test_data,train_target,test_target

# Split the filtered features and the "Target" column into train/test parts;
# both design matrices get a leading "Intercept" column of ones.
coefficients,train_data,test_data,train_target,test_target = split_and_return_coefficients(data_x_filtered,data_y,0.7,"Target","Intercept")

# Parameter estimation using batch gradient descent

In [None]:
# Number of rows in the training set.
n_samples = len(train_data.index)

# Draw one starting value in [0, 0.01) per coefficient name, in order.
coeff_values_dict = OrderedDict()
for coeff in coefficients:
    coeff_values_dict[coeff] = np.random.uniform(0,0.01)
# Plain list of the starting values, in the same order as `coefficients`.
coeff_values = list(coeff_values_dict.values())
print("Initial coeff parameter values are:")
coeff_values_dict

In [None]:
def getHypothesisValue(coeff_values_h,training_hypoth):
    """Return the linear-model predictions ``training_hypoth . coeff_values_h``.

    Parameters
    ----------
    coeff_values_h : sequence of float
        One coefficient per column of ``training_hypoth``.
    training_hypoth : pd.DataFrame
        Design matrix, one row per example.

    Returns
    -------
    np.ndarray
        Predicted value for every row of ``training_hypoth``.

    Raises
    ------
    TypeError
        If ``training_hypoth`` is not a DataFrame. The previous version
        printed a message and returned ``None``, which only surfaced later as
        an unrelated arithmetic error in the callers.
    """
    if not isinstance(training_hypoth,pd.DataFrame):
        raise TypeError(
            "df expected found something else: {}".format(type(training_hypoth).__name__)
        )
    return np.dot(training_hypoth,coeff_values_h)
            
        

In [None]:
def getMSEValue(example_matrix,example_target,coeff_values_mse):
    """Return the halved mean squared error of the linear model.

    Computes ``sum((X.b - y)^2) / (2 * n)`` where ``X`` is ``example_matrix``,
    ``b`` is ``coeff_values_mse``, ``y`` is ``example_target`` and ``n`` is
    ``len(example_target)``. Note the factor 2 in the denominator: the value
    is half of the conventional MSE.
    """
    predictions = getHypothesisValue(coeff_values_mse,example_matrix)
    residuals = np.subtract(predictions,example_target)
    # Dot product of the residual vector with itself = sum of squared errors.
    sum_of_squares = np.dot(np.transpose(residuals),residuals)
    n_examples = len(example_target)
    return sum_of_squares/(2*n_examples)
    

In [None]:
def gradientUpdate(train_g,train_target_g,coeff_values_g,alpha_g):
    """Perform one batch gradient-descent step and return the new coefficients.

    Parameters
    ----------
    train_g : pd.DataFrame
        Design matrix, one row per example.
    train_target_g : array-like
        Target value for every row of ``train_g``.
    coeff_values_g : sequence of float
        Current coefficients, one per column of ``train_g``. Not modified.
    alpha_g : float
        Learning rate.

    Returns
    -------
    np.ndarray
        ``coeff_values_g - (alpha_g / n) * X^T (X.b - y)`` with
        ``n = len(train_target_g)``.
    """
    error = np.subtract(getHypothesisValue(coeff_values_g,train_g),train_target_g)
    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    gradient = np.dot(train_g.T.values,error)
    updated_coeff_values = coeff_values_g - (alpha_g/len(train_target_g))*gradient
    return updated_coeff_values
    
    

In [None]:
def converge(converge_train,converge_train_target,converge_test,converge_test_target,converge_coeff_values,converge_alpha,converge_threshold,converge_max_iter):
    """Run batch gradient descent until one of the stop counters trips.

    Each pass takes one gradient step, then measures the relative change of
    the training loss against the last *recorded* training loss:

    * ``0 < change <= converge_threshold``, or at least ``converge_max_iter``
      losses already recorded: ``converge_trigger`` is incremented.
    * ``change < 0`` (loss went up): ``increasing_trigger`` is incremented.
    * otherwise: both counters are reset and the new train/test losses are
      appended to the histories.

    The loop stops once ``converge_trigger`` reaches 10 or
    ``increasing_trigger`` reaches 100.

    Notes
    -----
    * Losses are appended only in the third case, so the histories (and
      ``iterations``) count recorded improving steps, not every gradient
      step taken.
    * The coefficient update runs before the stop check, so the returned
      coefficients include steps whose losses were not recorded.

    Parameters
    ----------
    converge_train, converge_test : pd.DataFrame
        Train and test design matrices.
    converge_train_target, converge_test_target : array-like
        Targets matching the rows of the respective matrix.
    converge_coeff_values : sequence of float
        Starting coefficients. Not modified in place.
    converge_alpha : float
        Learning rate.
    converge_threshold : float
        Relative loss improvement at or below which a step counts as converged.
    converge_max_iter : int
        Once this many losses are recorded, every further step counts towards
        ``converge_trigger``.

    Returns
    -------
    dict
        Keys ``"iterations"`` (``len(train_loss) - 1``), ``"train_loss"``,
        ``"test_loss"`` (lists of recorded losses) and ``"coeff_values"``.
    """
    converge_trigger = 0
    increasing_trigger = 0
    train_loss = [getMSEValue(converge_train,converge_train_target,converge_coeff_values)]
    test_loss = [getMSEValue(converge_test,converge_test_target,converge_coeff_values)]
    while True:
        converge_coeff_values = gradientUpdate(converge_train,converge_train_target,converge_coeff_values,converge_alpha)
        # Computed once per step and reused below; it was previously
        # recomputed when appended to the history.
        new_train_loss = getMSEValue(converge_train,converge_train_target,converge_coeff_values)
        error_change = (train_loss[-1]-new_train_loss)/train_loss[-1]
        if converge_trigger == 10 or increasing_trigger == 100:
            break
        if (error_change>0 and error_change<=converge_threshold) or len(train_loss)>=converge_max_iter:
            converge_trigger += 1
        elif error_change<0:
            increasing_trigger += 1
        else:
            converge_trigger = 0
            increasing_trigger = 0
            train_loss.append(new_train_loss)
            test_loss.append(getMSEValue(converge_test,converge_test_target,converge_coeff_values))

    output_dict = {
        "iterations": len(train_loss)-1,
        "train_loss": train_loss,
        "test_loss": test_loss,
        "coeff_values": converge_coeff_values,
    }
    return output_dict

# Part1 : Varying alpha

In [None]:
# Candidate learning rates for the sweep; also used as the legend/tick labels
# of the plots below.
alpha_values = [0.0001,0.001,0.01,0.05,0.1,0.5,0.8,1,2]
def get_best_alpha(alpha_values,train_data_a,train_target_a,test_data_a,test_target_a,coeff_values_a,threshold_a,max_iter_a):
  """Run `converge` once per learning rate and collect the result dicts.

  Every run starts from the same `coeff_values_a`, threshold and iteration
  cap. Despite the name, no best alpha is selected here: the returned list
  holds one `converge` output per entry of `alpha_values`, in the same order.
  """
  return [
    converge(train_data_a,train_target_a,test_data_a,test_target_a,coeff_values_a,learning_rate,threshold_a,max_iter_a)
    for learning_rate in alpha_values
  ]

def alpha_plot(results, alphas=None):
    """Plot train and test loss histories side by side, one line per run.

    Parameters
    ----------
    results : list of dict
        Outputs of `converge`; each must hold "train_loss" and "test_loss".
    alphas : sequence, optional
        Legend label for every entry of `results`. Defaults to the
        module-level `alpha_values`, which was previously the only option.

    Raises
    ------
    ValueError
        If there are fewer labels than results.
    """
    if alphas is None:
        alphas = alpha_values
    if len(alphas) < len(results):
        raise ValueError("alpha_plot needs one label per result")
    if not results:
        # Nothing to draw; do not create empty axes.
        return

    # (subplot position, history key, y-axis label, title) for each panel.
    panels = (
        (121, "train_loss", "Training Error", "Train Error convergence for various alpha"),
        (122, "test_loss", "Test Error", "Test Error convergence for various alphas"),
    )
    for position, loss_key, y_label, title in panels:
        plt.subplot(position)
        for alpha_label, run in zip(alphas, results):
            plt.plot(run[loss_key], label=alpha_label)
        # Labels, title and legend are set once per panel rather than once
        # per plotted run.
        plt.xlabel("iterations")
        plt.ylabel(y_label)
        plt.title(title)
        plt.legend()
    plt.subplots_adjust(left=0.5, right=3)
# Sweep all learning rates on the filtered-feature split (relative-change
# threshold 0.000001, cap of 30000 recorded losses) and plot the histories.
alpha_results = get_best_alpha(alpha_values,train_data,train_target,test_data,test_target,coeff_values,0.000001,30000)
alpha_plot(alpha_results)

In [None]:
# Last recorded train/test loss of every learning-rate run.
train_losses = [run["train_loss"][-1] for run in alpha_results]
test_losses = [run["test_loss"][-1] for run in alpha_results]

plt.subplot(111)
plt.plot(train_losses,label="Train Errors")
plt.plot(test_losses,label="Test Errors")
plt.xlabel("alpha")
plt.ylabel("Error")
plt.title("Train and Test Error for various alpha")
plt.legend()
# One tick per alpha. The previous range(len(alpha_values)+1) supplied one
# more location than labels, which matplotlib rejects.
plt.xticks(range(len(alpha_values)),alpha_values)
plt.subplots_adjust(left=0.5, right=3)

We choose 0.5 as the best alpha because it converges faster than the smaller alpha values, while for alpha values above 0.5 the loss does not converge to the minimum.

# Part 2

In [None]:
# Convergence thresholds (relative change of the training loss) to compare.
threshold_list = [0.001,0.0001,0.00001,0.000001,0.0000001]
alpha = 0.5
output = []
for thres in threshold_list:
  # Use the `alpha` defined above instead of repeating the literal 0.5.
  result = converge(train_data,train_target,test_data,test_target,coeff_values,alpha,thres,15000)
  output.append(result)

# One tick per threshold. The previous range(len(threshold_list)+1) supplied
# one more location than labels, which matplotlib rejects.
tick_positions = range(len(threshold_list))

plt.subplot(121)
plt.plot([result["train_loss"][-1] for result in output])
plt.xlabel("Threshold Value as percent change")
plt.ylabel("Train Error")
plt.xticks(tick_positions,threshold_list)
plt.title("Train Error variation on Threshold Value")
plt.subplot(122)
plt.plot([result["test_loss"][-1] for result in output])
plt.xlabel("Threshold Value as percent change")
plt.ylabel("Test Error")
plt.xticks(tick_positions,threshold_list)
plt.title("Test Error variation on Threshold Value")
plt.subplots_adjust(left=0.5, right=3)




In [None]:
# Run at index 3 of the threshold sweep, shown here as the "best threshold"
# (NOTE(review): index 3 is the 0.000001 entry if threshold_list is unchanged).
best_threshold_run = output[3]

plt.subplot(121)
plt.plot(best_threshold_run["train_loss"],label="Train Error")
plt.plot(best_threshold_run["test_loss"],label="Test Error")
plt.legend()
plt.xlabel("Iterations")
plt.ylabel("Error")
plt.title("Train and Test loss vs iterations for best threshold")
plt.subplots_adjust(left=0.5, right=3)

# Part 3

We will randomly choose 5 features and compare the error of this model with the error of the original 11-feature model.

In [None]:

#choosing five features randomly
# Pick five distinct predictor names at random (the last label, the target,
# is excluded) and take those columns from the scaled features.
random_feature_names = list(np.random.choice(features[:-1],5,replace=False))
final_ex = data_x_scaled[random_feature_names]
coefficients_ex,train_ex,test_ex,train_target_ex,test_target_ex = split_and_return_coefficients(final_ex,data_y,0.7,"Target","Intercept")

# Draw one starting value in [0, 0.1) per coefficient name, in order.
coeff_values_dict_ex = OrderedDict()
for coeff in coefficients_ex:
    coeff_values_dict_ex[coeff] = np.random.uniform(0,0.1)
coeff_values_ex = list(coeff_values_dict_ex.values())

# Repeat the learning-rate sweep for the random five-feature model.
alpha_results = get_best_alpha(alpha_values,train_ex,train_target_ex,test_ex,test_target_ex,coeff_values_ex,0.000001,15000)
alpha_plot(alpha_results)



In [None]:
#choose best alpha
# Learning rate and relative-change threshold for the random five-feature model.
alpha_ex = 0.5
threshold_ex = 0.000001

result_ex = converge(train_ex,train_target_ex,test_ex,test_target_ex,coeff_values_ex,alpha_ex,threshold_ex,15000)

# Last recorded losses of the run.
final_train_loss_ex = result_ex["train_loss"][-1]
final_test_loss_ex = result_ex["test_loss"][-1]

print("random features are:",final_ex.columns.values)
print("Training error for five random features model is: ",final_train_loss_ex)
print("Test error for five random features model is: ", final_test_loss_ex )

In [None]:

#original data
# Re-split the filtered-feature data. This rebinds `coefficients`,
# `train_target` and `test_target`, which earlier cells also assign.
coefficients,train,test,train_target,test_target = split_and_return_coefficients(data_x_filtered,data_y,0.7,"Target","Intercept")

# Draw one starting value in [0, 0.1) per coefficient name, in order.
coeff_values_dict = OrderedDict((coeff,np.random.uniform(0,0.1)) for coeff in coefficients)
coeff_values = list(coeff_values_dict.values())

# Learning rate and relative-change threshold.
alpha = 0.5
threshold = 0.000001

result = converge(train,train_target,test,test_target,coeff_values,alpha,threshold,15000)

# The messages previously said "five random features model", copied from the
# cell above, although this run uses the filtered feature set.
print("Training error for original filtered features model is: ",result["train_loss"][-1])
print("Test error for original filtered features model is: ", result["test_loss"][-1] )

In [None]:
# Summary table: one row per model, last recorded train/test loss as columns.
experiment_df = pd.DataFrame(
    {
        "train_loss": [result_ex["train_loss"][-1],result["train_loss"][-1]],
        "test_loss": [result_ex["test_loss"][-1],result["test_loss"][-1]],
    },
    index=["five_featues_random","11 featues"],
    columns=["train_loss","test_loss"],  # pin the column order explicitly
)
experiment_df

# Part 4

Based on my intuition, the features "PageLikes", "PageCheckins", "PageTalkingAbout", "PostShareCount" and "Hhours" are important. We will find out how these features perform against the random 5-feature model and the original 11-feature model.

In [None]:
# chosing 5 best features
final_filtered_ex4= data_x_scaled[["PageLikes","PageCheckins","PageTalkingAbout","PostShareCount","Hhours"]]
final_transformed_ex4 = pd.DataFrame(MinMaxScaler().fit_transform(final_filtered_ex4),columns=final_filtered_ex4.columns)
coefficients_ex4,train_ex4,test_ex4,train_target_ex4,test_target_ex4 = split_and_return_coefficients(final_transformed_ex4,data_y,0.7,"Target","Intercept")



#Initialize random coefficient values
coeff_values_dict_ex4 = OrderedDict((coeff,np.random.uniform(0,0.1)) for coeff in coefficients_ex4)
coeff_values_ex4 = list(coeff_values_dict_ex4.values())


alpha_results = get_best_alpha(alpha_values,train_ex4,train_target_ex4,test_ex4,test_target_ex4,coeff_values_ex4,0.000001,15000)
alpha_plot(alpha_results)



In [None]:
#choosing alpha
# Learning rate and relative-change threshold for the hand-picked model.
alpha_ex4 = 0.5
threshold_ex4 = 0.000001

result_ex4 = converge(train_ex4,train_target_ex4,test_ex4,test_target_ex4,coeff_values_ex4,alpha_ex4,threshold_ex4,15000)

# The messages previously said "five random features model", copied from the
# random-feature cell, although this run uses the hand-picked features.
print("Training error for five hand-picked features model is: ",result_ex4["train_loss"][-1])
print("Test error for five hand-picked features model is: ", result_ex4["test_loss"][-1] )

In [None]:
# Append the hand-picked five-feature model as a third row; the values are
# given in [train_loss, test_loss] order and modify experiment_df in place.
experiment_df.loc["5 best"] = [result_ex4["train_loss"][-1],result_ex4["test_loss"][-1]]
experiment_df

In [None]:
# Build a separate frame with the target appended. The previous plain
# assignment aliased train_ex4, so adding the column also changed the
# training design matrix.
five_best = train_ex4.assign(target=train_target_ex4.values)

In [None]:
# Build a separate frame with the target appended. The previous plain
# assignment aliased train_ex, so adding the column also changed the
# training design matrix.
five_random = train_ex.assign(target=train_target_ex)


In [None]:
# Pairwise correlation matrix of the five_random frame.
print("five random feature model")
five_random.corr()

In [None]:
# Pairwise correlation matrix of the five_best frame.
print("five best features model")
five_best.corr()

# Final Equation with Parameters

In [None]:
# Print the fitted model term by term: each learned coefficient paired with
# the column of the training design matrix it multiplies.
fitted_coefficients = result["coeff_values"]
column_names = train.columns.values
for beta,feature in zip(fitted_coefficients,column_names):
  term = "{} * ".format(beta)+feature+" + "
  print(term)