<a href="https://colab.research.google.com/github/Stephanie-DS/takehome-stephanie-workingcopy/blob/main/Models_from_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the askscience posts CSV straight from GitHub.
url = 'https://raw.githubusercontent.com/Pivotal-Life-Sciences-DS/takehome-stephanie/main/askscience_data.csv'
df = pd.read_csv(url)

# Build both row filters against the freshly loaded frame and apply them together:
#   * drop every post whose score equals the maximum score in the data
#   * drop posts tagged 'Meta' / 'META'
not_top_score = df['score'] != df['score'].max()
not_meta = ~df['tag'].isin(['Meta', 'META'])
df = df[not_top_score & not_meta]


In [None]:
# Feature engineering: add numeric columns derived from the raw post fields,
# one-hot encode the tag, then drop the raw columns the models cannot consume.

# Character counts; a missing body is treated as an empty string (length 0).
df['title_len'] = df['title'].str.len()
df['body_len'] = df['body'].fillna("").str.len()

# author post count, most prolific author first
authorcount = df.groupby('author').size().sort_values(ascending=False)
authorcount = authorcount.to_frame(name='author_post_count')
# NOTE(review): the two lines below zero rows by POSITION. They assume the two
# most frequent "authors" are the deleted-user placeholder and the moderator
# account -- confirm against the data before relying on this.
authorcount.iloc[0] = 0  # I want to remove counts for Deleted users
authorcount.iloc[1] = 0  # i want to neutralize the mods' counts, too
# Look every author up in the count table. Series.map performs this as one
# index-aligned lookup, instead of a list-vs-list replace() over every distinct
# author name.
df['authorcount'] = df['author'].map(authorcount['author_post_count'])

# is an AMA (case-sensitive match on "AMA" anywhere in the title)
df['is_ama'] = df['title'].str.contains("AMA").astype('float')

# is a megathread (case-insensitive match)
df['is_megathread'] = df['title'].str.contains("Megathread", case=False).astype('float')

# has no capitalization
df['no_capitalization'] = df['title'].str.islower().astype('float')

# add tags as one-hot encoded columns
df_tags = pd.get_dummies(df, columns=['tag'])

# Keep the engineered features, the tag dummies and the target; drop the raw
# text / metadata columns.
df = df_tags.drop(['title', 'body', 'datetime', 'author', 'upvote_ratio', 'url'], axis=1)

# Very Basic Model
I'd like to make a very basic regression model just to take the features for a spin.


In [None]:
# Target is the post score; the features are every remaining column.
y = df['score']
X = df.drop(columns='score')

In [None]:
from sklearn.model_selection import train_test_split

# Split into train and test partitions using scikit-learn's default proportions.
# The seed is fixed so that the split -- and therefore every score reported
# further down -- is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Sanity check: number of training rows / features, and matching target length.
print(X_train.shape)
print(y_train.shape)

(3151, 37)
(3151,)


In [None]:
from sklearn.linear_model import LinearRegression

# Linear-regression baseline: fit on the training split, then report the
# model's score on the training and test splits.
lr = LinearRegression()
lr.fit(X_train, y_train)

lr_train_score = lr.score(X_train, y_train)
lr_test_score = lr.score(X_test, y_test)
print("Training set score:, {:.2f}".format(lr_train_score))
print("Test set score:, {:.2f}".format(lr_test_score))

Training set score:, 0.66
Test set score:, 0.66


In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with the estimator's default settings, scored on the same
# two splits as the linear model.
ridge = Ridge()
ridge.fit(X_train, y_train)

ridge_train_score = ridge.score(X_train, y_train)
ridge_test_score = ridge.score(X_test, y_test)
print("Training set score:, {:.2f}".format(ridge_train_score))
print("Test set score:, {:.2f}".format(ridge_test_score))


Training set score:, 0.66
Test set score:, 0.66


In [None]:
# Ridge predictions for the held-out rows, placed next to the true scores
# (indexed like y_test) for side-by-side inspection.
y_pred = ridge.predict(X_test)
df_ridgetest = y_test.to_frame("y_test").assign(y_pred=y_pred)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Two regression trees of different depth, to see how depth affects fit.
# random_state is fixed so the fitted trees (and their scores) are repeatable
# across runs.
regr_1 = DecisionTreeRegressor(max_depth=3, random_state=0)
regr_2 = DecisionTreeRegressor(max_depth=5, random_state=0)
for tree in (regr_1, regr_2):
    tree.fit(X_train, y_train)

# Held-out predictions from each tree.
y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)

# Scores are printed for the depth-3 tree first, then the depth-5 tree.
for tree in (regr_1, regr_2):
    print("Training set score:, {:.2f}".format(tree.score(X_train, y_train)))
    print("Test set score:, {:.2f}".format(tree.score(X_test, y_test)))



Training set score:, 0.83
Test set score:, 0.59
Training set score:, 0.88
Test set score:, 0.63


In [None]:
# Depth-3 tree predictions next to the true held-out scores (indexed like y_test).
df_treetest = y_test.to_frame("y_test").assign(y_pred=y_1)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest limited to depth 5, seeded for repeatability; scored on the
# training and test splits.
rfr = RandomForestRegressor(max_depth=5, random_state=0)
rfr.fit(X_train, y_train)

rfr_train_score = rfr.score(X_train, y_train)
rfr_test_score = rfr.score(X_test, y_test)
print("Training set score:, {:.2f}".format(rfr_train_score))
print("Test set score:, {:.2f}".format(rfr_test_score))

Training set score:, 0.91
Test set score:, 0.80


In [None]:
from sklearn.model_selection import cross_val_score
# Hyperparameter grid for the random-forest search: each key is a
# RandomForestRegressor argument, each value the list of settings to try.
search_criterion = dict(
    criterion=['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    max_depth=list(range(2, 9)),
    min_samples_split=list(range(2, 8)),
    min_samples_leaf=list(range(1, 10)),
)

In [None]:
from itertools import product

# Exhaustive grid search over `search_criterion`, keeping the combination with
# the highest mean 5-fold cross-validated R^2 on the TRAINING split. The test
# split is deliberately not touched here, so it stays an honest hold-out for
# the final model.
#
# NOTE: this evaluates every combination in the grid with 5 fits each, which
# is expensive -- shrink the grid for a quick run.

# R^2 can be negative, so start below any achievable score rather than at 0.
max_score = float('-inf')
best_parameters = {}

# Iterate the grid in the same order as nested loops over the dict's keys.
param_names = list(search_criterion)
for values in product(*(search_criterion[name] for name in param_names)):
    candidate = dict(zip(param_names, values))
    regr = RandomForestRegressor(random_state=12, **candidate)

    # Get a series of 5 cross validation scores for R^2 for THIS candidate.
    # With cross_val_score's default error handling, a candidate whose fit
    # fails scores NaN; NaN never compares greater, so it is skipped below.
    cv_scores = cross_val_score(
        regr, X_train, y_train, cv=5, scoring='r2', n_jobs=-1
    )
    myscore = cv_scores.mean()

    # Compare the mean cross-validated R^2 of the present parameter set with
    # the previous best. If the current set outperforms the prior winner,
    # it becomes the new winner.
    if myscore > max_score:
        max_score = myscore
        best_parameters = candidate

In [None]:
# Display the hyperparameter combination recorded as best by the grid-search cell.
best_parameters

{'criterion': 'squared_error',
 'max_depth': 2,
 'min_samples_split': 2,
 'min_samples_leaf': 1}

In [None]:
# Display the score that the grid-search cell recorded for the best combination.
max_score

0.8043938088513618

In [None]:
# Finally, train a single random forest with the hyperparameters selected by
# the grid search, report its score on both splits, and rank the input
# features by the importance the fitted forest assigns to them.

regr = RandomForestRegressor(
    criterion=best_parameters['criterion'],
    # The search stores this setting under 'max_depth'; there is no
    # 'n_estimators' entry in best_parameters.
    max_depth=best_parameters['max_depth'],
    min_samples_split=best_parameters['min_samples_split'],
    min_samples_leaf=best_parameters['min_samples_leaf'],
    random_state=12,
)

regr.fit(X_train, y_train)

y_pred = regr.predict(X_test)

print("Training set score:, {:.2f}".format(regr.score(X_train, y_train)))
print("Test set score:, {:.2f}".format(regr.score(X_test, y_test)))

# feature_importances_ has one entry per training column, in column order, so
# the labels must come from the frame the model was fitted on.
feature_importances = pd.DataFrame({'features': X_train.columns,
                                    'importances': regr.feature_importances_})

feature_importances.sort_values('importances', ascending=False)