# Draft analysis 

---

Group name:

---


## Introduction

*This section introduces the project motivation, the data, and the research question. It should also include a data dictionary.*

## Setup

In [None]:
#Import all necessary libraries
%matplotlib inline
import numpy as np
import pandas as pd
import altair as alt
import seaborn as sns
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
import xgboost as xg
import pickle
# Turn off Altair's maximum-rows check for chart data.
alt.data_transformers.disable_max_rows()

## Data

## Import data

In [None]:
# Load the transformed data set into a DataFrame; the first CSV column
# becomes the row index.
df = pd.read_csv(
    "../data/interim/TransformedData",
    sep=",",
    index_col=0,
)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

### Data structure

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

### Data corrections

In [None]:
# Column groups by intended dtype. Names are kept exactly as spelled here,
# and the order within each list is unchanged.

# Columns to be treated as categorical.
cat_vars = [
    "gender", "condtn", "match", "samerace",
    "age_o", "race_o", "dec_o", "met_o",
    "field_cd", "race", "zipcode", "goal",
    "date", "go_out", "career_c", "dec",
    "met", "length", "numdat_2", "date_3",
]

# Columns to be treated as floating point.
float_vars = [
    "int_corr",
    "pf_o_att", "pf_o_sin", "pf_o_int", "pf_o_fun", "pf_o_amb", "pf_o_sha",
    "income",
    "attr1_1", "sinc1_1", "intel1_1", "fun1_1", "amb1_1", "shar1_1",
    "attr4_1", "sinc4_1", "intel4_1", "fun4_1", "amb4_1", "shar4_1",
    "attr2_1", "sinc2_1", "intel2_1", "fun2_1", "amb2_1", "shar2_1",
]

# Columns listed as integer-valued.
int_vars = [
    "attr_o", "sinc_o", "intel_o", "fun_o", "amb_o", "shar_o", "like_o", "prob_o",
    "age", "imprace", "imprelig",
    "sports", "tvsports", "excersice", "dining", "museums", "art",
    "hiking", "gaming", "clubbing", "reading", "tv", "theater",
    "movies", "concerts", "music", "shopping", "yoga",
    "exhappy",
    "attr3_1", "sinc3_1", "intel3_1", "fun3_1", "amb3_1",
    "attr5_1", "sinc5_1", "intel5_1", "fun5_1", "amb5_1",
    "attr", "sinc", "intel", "fun", "amb", "shar", "like", "prob",
    "attr1_s", "sinc1_s", "intel1_s", "fun1_s", "amb1_s", "shar1_s",
    "attr4_s", "sinc4_s", "intel4_s", "fun4_s", "amb4_s",
    "satis_2",
    "iid", "id", "idg", "wave", "round", "order", "partner", "pid",
    "expnum", "you_call", "them_cal", "numdat_3", "num_in_3",
    "position", "positin1",
]

# Free-text columns.
str_vars = ["field", "from", "career"]

# Columns listed as not used.
unused_vars = ["undergrd", "mn_sat", "tuition"]

In [None]:
# Cast each column group to its target dtype: categorical, float, then string.
for column_group, target_dtype in (
    (cat_vars, "category"),
    (float_vars, "float"),
    (str_vars, "str"),
):
    df[column_group] = df[column_group].astype(target_dtype, copy=False)

In [None]:
# Replace the short rating column codes with readable labels (in place).
readable_names = {
    "attr": "Attractiveness",
    "sinc": "Sincerity",
    "intel": "Intelligence",
    "fun": "Funniness",
    "amb": "Ambitionsness",
    "shar": "Percieved shared interests",
    "int_corr": "Correlation of interests",
}
df.rename(columns=readable_names, inplace=True)

In [None]:
# Columns kept for the analysis; the frame is narrowed to exactly these.
variables = [
    'Attractiveness',
    'Sincerity',
    'Intelligence',
    'Funniness',
    'Ambitionsness',
    'Percieved shared interests',
    'prob',
    'like',
]
df = df[variables]

In [None]:
# Share of missing vs. present values per column, as filled horizontal bars.
na_flags_long = df.isna().melt(value_name="NaN")
g = sns.displot(data=na_flags_long, y="variable", hue="NaN", multiple="fill")

Drop all rows that contain missing values (NAs).

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove every row with a missing value and renumber the index from 0.
# A chained reassignment is used instead of inplace=True: `df` was produced by
# a column selection, and in-place mutation of such a frame can raise pandas'
# SettingWithCopyWarning.
df = df.dropna().reset_index(drop=True)

In [None]:
# Same missing-value chart as before the drop, to confirm no NaNs remain.
na_flags_long = df.isna().melt(value_name="NaN")
g = sns.displot(data=na_flags_long, y="variable", hue="NaN", multiple="fill")

### Variable lists

In [None]:
# The target is the last entry of `variables`; everything before it is a feature.
# Slicing is used instead of variables.pop() so that `variables` is not mutated:
# with pop(), re-running this cell would silently turn the next column into the
# target and shrink the feature list.
y_label = variables[-1]
features = variables[:-1]

# Feature matrix and target vector.
X = df[features]
y = df[y_label]

In [None]:
# Show the name of the target column.
y_label

In [None]:
# Show the list of feature columns.
features

### Data splitting

In [None]:
# Hold out 20% of the rows as a test set; shuffled with a fixed seed so the
# split is reproducible.
split_options = dict(test_size=0.2, shuffle=True, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Analysis

### Descriptive statistics

In [None]:
# Summary statistics, transposed so each variable is a row.
df.describe().T

### Exploratory data analysis

In [None]:
# One binned histogram per feature, laid out side by side.
histogram_template = alt.Chart(df).mark_bar().encode(
    alt.X(alt.repeat("column"), type="quantitative", bin=True),
    y='count()',
)
histogram_template.properties(width=150, height=150).repeat(column=features)

In [None]:
# Pair plot with 'like' on the y-axis of every panel.
sns.pairplot(df, y_vars='like')

In [None]:
# Violin plot of every column in the frame.
sns.violinplot(data=df)

In [None]:
# Box plot of every column in the frame.
sns.boxplot(data=df)

### Relationships

In [None]:
# Pairwise correlation matrix of all columns, then the correlations with the
# outcome 'like', sorted from strongest positive downwards.
corr = df.corr()
corr['like'].sort_values(ascending=False)

In [None]:
# Render the full correlation matrix with a blue colour gradient.
corr.style.background_gradient(cmap='Blues')

## Model Regression

### Select model

In [None]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
reg = LinearRegression()


### Training and validation Regression

In [None]:
# 5-fold cross-validation on the training split. The scorer returns the
# negated MSE, so the sign is flipped to get plain MSE values.
scores = -cross_val_score(
    reg, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
)

# One row per fold, with the index shifted to 1-based fold numbers.
df_scores = pd.DataFrame({"lr": scores})
df_scores.index = df_scores.index + 1

# Display the per-fold errors with a blue colour gradient.
df_scores.style.background_gradient(cmap='Blues')

In [None]:
# Line chart (with point markers) of the cross-validated MSE per fold.
fold_scores = df_scores.reset_index()
fold_axis = alt.X("index", bin=False, title="Fold", axis=alt.Axis(tickCount=5))
mse_axis = alt.Y("lr", aggregate="mean", title="Mean squared error (MSE)")
alt.Chart(fold_scores).mark_line(point=alt.OverlayMarkDef()).encode(
    x=fold_axis,
    y=mse_axis,
)

In [None]:
# Summary statistics of the per-fold MSE values.
df_scores.describe().T

### Fit model

In [None]:
# Fit the linear regression on the training split.
reg.fit(X_train, y_train)

In [None]:
# Single-row frame holding the fitted intercept.
intercept = pd.DataFrame(
    dict(Name=["Intercept"], Coefficient=[reg.intercept_])
)

# One row per feature with its fitted slope.
slope = pd.DataFrame(dict(Name=features, Coefficient=reg.coef_))

# Stack intercept and slopes into one table and show it to three decimals.
table = pd.concat(
    [intercept, slope],
    ignore_index=True,
    sort=False,
)

table.round(3)

### Evaluation on test set

In [None]:
# Predict the target for the held-out test split with the fitted linear regression.
y_pred = reg.predict(X_test)

In [None]:
# Test-set error metrics of the linear regression, collected in one Series.
# Previously RMSE and MSE were computed but never shown, because a cell only
# renders its last expression. RMSE is derived as sqrt(MSE) rather than via
# the `squared=False` argument, which newer scikit-learn releases no longer accept.
test_mse = mean_squared_error(y_test, y_pred)
pd.Series(
    {
        "MAE": mean_absolute_error(y_test, y_pred),
        "RMSE": np.sqrt(test_mse),
        "MSE": test_mse,
        "R2": r2_score(y_test, y_pred),
    },
    name="Linear regression (test set)",
).round(3)

In [None]:
# Absolute coefficient size per feature, tagged with the model name so it can
# be stacked with the other models' tables.
importance = np.abs(reg.coef_)

df_imp = pd.DataFrame({"coeff": importance, "name": features}).assign(
    reg="MultiLinear"
)
df_imp

### Save model



Save your model in the folder `models/`. Use a meaningful name and a timestamp.

In [None]:
# Target directory and file name for the pickled linear regression model.
folder = '../models/'
pkl_filename = 'clf_reg_20221222.pkl'

In [None]:
# Serialise the fitted linear regression to disk.
model_path = folder + pkl_filename
with open(model_path, 'wb') as model_file:
    pickle.dump(reg, model_file)

In [None]:
# Read the pickled model back from disk and display it.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that come from a trusted source.
with open(folder + pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)

pickle_model

# Model Lasso


In [None]:
# Lasso with the regularisation strength chosen by 5-fold cross-validation,
# fitted on the training split.
lasso = LassoCV(cv=5,random_state=0,max_iter=15000)
lasso.fit(X_train,y_train)

In [None]:
# Regularisation strength selected by the cross-validation.
lasso.alpha_

In [None]:
# Refit a plain Lasso at the alpha selected by LassoCV.
# The iteration budget is taken from the CV estimator so the refit runs under
# the same convergence limit as the search (Lasso's own default is lower).
lasso_best = Lasso(alpha=lasso.alpha_, max_iter=lasso.max_iter)
lasso_best.fit(X_train, y_train)

In [None]:
# Print each Lasso coefficient paired with its feature column name.
print(list(zip(lasso_best.coef_, X.columns)))

In [None]:
# R² of the refitted Lasso on both splits, printed as a percentage.
for split_label, split_X, split_y in (
    ('R squared training set', X_train, y_train),
    ('R squared test set', X_test, y_test),
):
    print(split_label, round(lasso_best.score(split_X, split_y)*100, 2))

In [None]:
# Test-set mean squared error of the refitted Lasso.
mean_squared_error(y_test, lasso_best.predict(X_test))

In [None]:
# Absolute Lasso coefficients per feature, tagged with the model name.
importance = np.abs(lasso_best.coef_)
df_lasso = pd.DataFrame({"coeff": importance, "name": features}).assign(
    reg="Lasso"
)

# Stack with the linear-regression table for the later comparison plot.
dfdisplay = pd.concat([df_imp, df_lasso])

In [None]:
# Cross-validation error along the alpha path: one dotted line per fold on a
# log-scaled x-axis, the mean across folds in black, and a dashed vertical
# marker at the selected alpha.
fold_mse = lasso.mse_path_
plt.semilogx(lasso.alphas_, fold_mse, ":")
plt.plot(
    lasso.alphas_,
    fold_mse.mean(axis=-1),
    "k",
    label="Average across the folds",
    linewidth=2,
)
plt.axvline(lasso.alpha_, linestyle="--", color="k", label="alpha: CV estimate")

plt.xlabel("alphas")
plt.ylabel("Mean square error")
plt.title("Mean square error on each fold")
plt.legend()
plt.axis("tight")


# XG Boost

In [None]:
# Base XGBoost regressor with the linear booster (gblinear) and RMSLE as evaluation metric.
regxg= xg.XGBRegressor(eval_metric='rmsle',booster='gblinear')

In [None]:
# Base XGBoost regressor with the library's default booster and RMSLE as evaluation metric.
regxgnolin= xg.XGBRegressor(eval_metric='rmsle')

In [None]:
# Hyperparameter grid searched for the linear-booster model.
param_grid = dict(
    n_estimators=[100, 500, 600, 700],
    learning_rate=[0.01, 0.015, 0.08, 0.5],
)

In [None]:
# Hyperparameter grid searched for the default-booster model.
param_grid_nolinear = dict(
    max_depth=[2, 5, 8, 10],
    n_estimators=[100, 500, 600, 700],
    learning_rate=[0.01, 0.015, 0.08],
)

In [None]:
# Exhaustive 5-fold grid search for the default-booster model on the training split.
searchnolinear = GridSearchCV(regxgnolin, param_grid_nolinear, cv=5)
searchnolinear.fit(X_train, y_train)

print("The best hyperparameters are ",searchnolinear.best_params_)

In [None]:
# Exhaustive 5-fold grid search for the linear-booster model on the training split.
search = GridSearchCV(regxg, param_grid, cv=5)
search.fit(X_train, y_train)

print("The best hyperparameters are ",search.best_params_)

In [None]:
# Rebuild the linear-booster model with the hyperparameters picked by the grid
# search, then fit it on the training split.
best_linear_params = search.best_params_
regxg = xg.XGBRegressor(
    booster='gblinear',
    learning_rate=best_linear_params["learning_rate"],
    n_estimators=best_linear_params["n_estimators"],
)

regxg.fit(X_train, y_train)

In [None]:
# Rebuild the default-booster model with the hyperparameters picked by the grid
# search, then fit it on the training split.
best_tree_params = searchnolinear.best_params_
regxgnolin = xg.XGBRegressor(
    max_depth=best_tree_params["max_depth"],
    learning_rate=best_tree_params["learning_rate"],
    n_estimators=best_tree_params["n_estimators"],
)

regxgnolin.fit(X_train, y_train)

In [None]:
# Test-set predictions of the default-booster XGBoost model.
predictionsnolin = regxgnolin.predict(X_test)

In [None]:
# Test-set predictions of the linear-booster XGBoost model.
predictions = regxg.predict(X_test)

In [None]:
# XGBoost's feature-importance chart for the linear-booster model.
xg.plot_importance(regxg)

In [None]:
# XGBoost's feature-importance chart for the default-booster model.
xg.plot_importance(regxgnolin)

Coefficients can only be displayed for the linear-booster XGBoost model, so only that model is included in the coefficient comparison below.

In [None]:
# Absolute coefficients of the linear-booster XGBoost model, tagged with the
# model name.
importance = np.abs(regxg.coef_)
df_xgboost = pd.DataFrame({"coeff": importance, "name": features}).assign(
    reg="XGBoost"
)

# Coefficient tables of all three linear-type models, stacked for plotting.
dfdisplay = pd.concat([df_imp, df_lasso, df_xgboost])

# Model Comparison

In [None]:
# Test-set predictions of every model, keyed by the suffix used in the
# "Type" labels. The Lasso predictions are computed once here.
model_predictions = {
    "Regression": y_pred,
    "Lasso": lasso_best.predict(X_test),
    "XGBOOST": predictions,
    "XGBOOSTNOLIN": predictionsnolin,
}

# Build one (label, value) row per metric and model, in the order
# MAE, RMSE, MSE, R2 for each model. RMSE is taken as sqrt(MSE) rather than via
# the `squared=False` argument, which newer scikit-learn releases no longer accept.
comparison_rows = []
for model_name, model_pred in model_predictions.items():
    model_mse = mean_squared_error(y_test, model_pred)
    model_metrics = {
        "MAE": mean_absolute_error(y_test, model_pred),
        "RMSE": np.sqrt(model_mse),
        "MSE": model_mse,
        "R2": r2_score(y_test, model_pred),
    }
    comparison_rows.extend(
        (f"{metric_name}_{model_name}", round(float(metric_value), 3))
        for metric_name, metric_value in model_metrics.items()
    )

Compdf = pd.DataFrame(comparison_rows, columns=["Type", "Values"])


In [None]:
# Show the metric comparison table.
Compdf

In [None]:
# Horizontal bars: one per metric/model combination.
sns.barplot(data=Compdf, x="Values", y="Type")

In [None]:
# Absolute coefficient per feature, with one bar colour per model.
sns.barplot(data=dfdisplay, x="coeff", y="name", hue="reg")

In [None]:
def _scatter_frame(model_label, model_pred):
    """Pair the test-set targets with one model's predictions, tagged by model."""
    return pd.DataFrame(
        {'True Values': y_test, "Predicted Values": model_pred, "type": model_label}
    )


# One true-vs-predicted frame per model, then all of them stacked.
dfscatter_Multilin = _scatter_frame("Multilinear", reg.predict(X_test))
dfscatter_Lasso = _scatter_frame("Lasso", lasso_best.predict(X_test))
dfscatter_XGboost = _scatter_frame("XGBOOST", predictions)
dfscatter_XGboostnolin = _scatter_frame("XGBOOSTnolin", predictionsnolin)
dfscatter = pd.concat(
    [dfscatter_Multilin, dfscatter_Lasso, dfscatter_XGboost, dfscatter_XGboostnolin]
)

In [None]:
# Distribution of predicted values at each true value, split by model.
sns.boxplot(data=dfscatter, x="True Values", y="Predicted Values", hue="type")

# Conclusions