In [1]:
# Big General Imports
import pandas as pd
import numpy as np
import re

# Processing Imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Basic Model Imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Scoring Imports
from sklearn.metrics import accuracy_score, confusion_matrix

# Neural Net Imports
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras as keras

# Recycled Imports
#import math
#import nltk
#import matplotlib.pyplot as plt
#import seaborn as sns
#from sklearn.metrics import classification_report
#from sklearn.decomposition import LatentDirichletAllocation



  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Data Prep

In [2]:
df_runway_top    = pd.read_csv("./data/renttherunway_first.csv").drop("Unnamed: 0", axis = 1)

df_runway_bottom = pd.read_csv("./data/renttherunway_last.csv").drop("Unnamed: 0", axis = 1)

df_modcloth = pd.read_json('./data/modcloth_final_data.json', lines = True)


In [3]:
df_all = pd.concat([df_runway_top[["review_text", "fit"]],
                    df_runway_bottom[["review_text", "fit"]],
                    df_modcloth[["review_text", "fit"]]
                   ]).dropna().reset_index(drop = True)
df_all.tail()

Unnamed: 0,review_text,fit
268542,Cute jacket!,fit
268543,It's a beautiful jacket. I love how it's knit ...,small
268544,I love this blazer. It is a great office piece...,fit
268545,I love this blazer!! I wore it yesterday and g...,fit
268546,I love this piece. I'm really happy with it!,fit


In [4]:
df_all.to_csv("./data/temp_all_reviews.csv", index_label=False)

# Loading Data
df_modcloth = pd.read_json('./data/modcloth_final_data.json', lines = True)
print("Prior to DF Cleaning:")
print(df_modcloth["fit"].value_counts()) # Prior to cleaning

# Filter for just Large and Small reviews
df_modcloth = df_modcloth[
    (df_modcloth["fit"] == "large") | 
    (df_modcloth["fit"] == "small")]

# Map Fit for Better Machine Learning
df_modcloth["fit"] = df_modcloth["fit"].map({"small": 0, "large": 1})

# Removing NA's & Resetting Index
df_modcloth = df_modcloth[["review_text", "fit"]].dropna().reset_index(drop = True)
X = df_modcloth["review_text"]
y = df_modcloth["fit"]

print("--------")
print("Post DF Cleaning")
print(df_modcloth["fit"].value_counts())

In [5]:
X = df_all["review_text"]
y = df_all["fit"].map({"small": -1, "fit":0, "large": 1})

In [7]:
# Init Bag-of-Words Processing
def preprocess(text):
    text = re.sub(r'[^a-zA-Z]',' ', text.lower())
    tokens = word_tokenize(text)
    lemmer = WordNetLemmatizer()
    stop_words = stopwords.words("english")
    return " ".join([lemmer.lemmatize(word) for word 
                     in tokens if len(word) > 1 and not word in stop_words])
tfidf = TfidfVectorizer(analyzer = "word",
                       min_df = 7,
                       preprocessor = preprocess,
                       stop_words = 'english')

# Fit transform features AKA The Documents
X = tfidf.fit_transform(X)
X.shape

  % sorted(inconsistent)


(268547, 9254)

In [8]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Random Forest Modeling

In [10]:

# Initialize and train the model
model_rf = RandomForestClassifier(random_state=1)
model_rf.fit(X_train, y_train)

# Make predictions
y_pred = model_rf.predict(X_test)

# Evaluate the model
acc_rf = accuracy_score(y_test, y_pred)
conf_mat_rf = confusion_matrix(y_test, y_pred)

# Print Results
print("Accuracy: ", acc_rf)
print("Confusion Matrix:\n", conf_mat_rf)

NameError: name 'acc_rf' is not defined

In [12]:
# Print Results
print("Accuracy: ", acc)
print("Confusion Matrix:\n", conf_mat)

Accuracy:  0.7787562837460436
Confusion Matrix:
 [[ 1645  5607   145]
 [  295 38325   270]
 [  119  5447  1857]]


In [13]:
conf_mat

array([[ 1916,  5322,   159],
       [  454, 38059,   377],
       [  176,  5322,  1925]], dtype=int64)

In [14]:
# Initialize Logistic Regression model
model_lr = LogisticRegression(random_state=1, max_iter = 1000)
# Train the model
model_lr.fit(X_train, y_train)
# Make predictions
y_pred_lr = model_lr.predict(X_test)
# Calculate accuracy and confusion matrix for Logistic Regression
acc_lr = accuracy_score(y_test, y_pred_lr)
conf_mat_lr = confusion_matrix(y_test, y_pred_lr)

# Print the results
print(f"Logistic Regression Accuracy: {acc_lr}")
print(f"Confusion Matrix:\n{conf_mat_lr}\n")


Logistic Regression Accuracy: 0.780320238316887
Confusion Matrix:
[[ 2653  4342   402]
 [ 1315 36382  1193]
 [  372  4175  2876]]



In [15]:
# Initialize Naive Bayes model
model_nb = MultinomialNB()
# Train the model
model_nb.fit(X_train, y_train)
# Make predictions
y_pred_nb = model_nb.predict(X_test)
# Calculate accuracy and confusion matrix for Naive Bayes
acc_nb = accuracy_score(y_test, y_pred_nb)
conf_mat_nb = confusion_matrix(y_test, y_pred_nb)

# Print the results
print(f"Naive Bayes Accuracy: {acc_nb}")
print(f"Confusion Matrix:\n{conf_mat_nb}\n")

Naive Bayes Accuracy: 0.757680134053249
Confusion Matrix:
[[ 3297  3453   647]
 [ 2579 34148  2163]
 [  722  3451  3250]]



In [None]:
# Initialize Support Vector Machine model
model_svm = SVC(random_state=1)
# Train the model
model_svm.fit(X_train, y_train)
# Make predictions
y_pred_svm = model_svm.predict(X_test)
# Calculate accuracy and confusion matrix for Support Vector Machine
acc_svm = accuracy_score(y_test, y_pred_svm)
conf_mat_svm = confusion_matrix(y_test, y_pred_svm)

# Print the results
print(f"Support Vector Machine Accuracy: {acc_svm}")
print(f"Confusion Matrix:\n{conf_mat_svm}")


In [17]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid_rf = {
#    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
#    'bootstrap': [True, False]
}

# Initialize GridSearchCV
grid_search_rf = GridSearchCV(estimator=model_rf, param_grid=param_grid_rf, 
                              cv=3, n_jobs=2, verbose=2, scoring='accuracy')

# Fit the grid search to the data
grid_search_rf.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters:", grid_search_rf.best_params_)

# Use the best estimator for predictions
model_rf_best = grid_search_rf.best_estimator_
y_pred_best = model_rf_best.predict(X_test)

# Evaluate the best model
acc_best = accuracy_score(y_test, y_pred_best)
conf_mat_best = confusion_matrix(y_test, y_pred_best)

# Print Results
print("Best Accuracy: ", acc_best)
print("Best Confusion Matrix:\n", conf_mat_best)


Fitting 3 folds for each of 12 candidates, totalling 36 fits


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.


In [None]:
from sklearn.ensemble import VotingClassifier

# Initialize other classifiers to include in the ensemble
model_lr = LogisticRegression(random_state=1)
model_nb = MultinomialNB()
model_svc = SVC(probability=True, random_state=1)

# Create a list of tuples with classifier name and classifier object
classifiers = [
    ('rf', model_rf_best),
    ('lr', model_lr),
    ('nb', model_nb),
    ('svc', model_svc)
]

# Initialize the VotingClassifier with soft voting
ensemble_model = VotingClassifier(estimators=classifiers, voting='soft')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_ensemble = ensemble_model.predict(X_test)
acc_ensemble = accuracy_score(y_test, y_pred_ensemble)
conf_mat_ensemble = confusion_matrix(y_test, y_pred_ensemble)

# Print results
print("Ensemble Accuracy: ", acc_ensemble)
print("Ensemble Confusion Matrix:\n", conf_mat_ensemble)


In [23]:
# Assuming 'vec' is the CountVectorizer instance you used to fit your training data
new_text_vectorized = cvec.transform("this was super duper short")

# Now you can predict using your trained Random Forest classifier 'model_rf'
prediction = model_rf.predict(new_text_vectorized)


ValueError: Iterable over raw text documents expected, string object received.

# Neural Net Processing

In [19]:
import tensorflow as tf

# Convert data to float32
X_train_tf = X_train.values.astype('float32')
X_test_tf = X_test.values.astype('float32')
y_train_tf = y_train.values.astype('float32')
y_test_tf = y_test.values.astype('float32')

# Build and compile the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train_tf.shape[1],)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_tf, y_train_tf, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate the model
metrics = model.evaluate(X_test_tf, y_test_tf)


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 16660 samples, validate on 4166 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [28]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X, series_fit)


RandomForestClassifier(random_state=42)

In [29]:
# For new text prediction
new_text = "this top is too small"
new_text_processed = vec.transform([new_text])
prediction_rf = model_rf.predict(new_text_processed)


In [30]:
prediction_rf.

array([0], dtype=int64)

In [31]:
# ... (previous code for loading data, preprocessing, and model training)

# Define the function to interpret the prediction
def get_fit_label(prediction):
    label_map = {0: "small", 1: "large"}
    return label_map.get(prediction[0], "Unknown")

# Example usage
new_text = "this top is too small"
new_text_processed = vec.transform([new_text])
prediction_rf = model_rf.predict(new_text_processed)

# Get and print the prediction
predicted_label = get_fit_label(prediction_rf)
print(f"The predicted fit for the review is: {predicted_label}")


The predicted fit for the review is: small


In [33]:
# Make a prediction with probability estimates
new_text = "kind of small but also a bit wide in other places"
new_text_processed = vec.transform([new_text])
prediction_prob = model_rf.predict_proba(new_text_processed)

# Scale the probability of the 'large' class from [0, 1] to [-1, 1]
# 'large' is assumed to be the second class (index 1)
scaled_prediction = prediction_prob[0][1] * 2 - 1  # This converts the scale

print(f"The scaled fit for the review is: {scaled_prediction:.2f}")


The scaled fit for the review is: -0.28


In [36]:
prediction_prob

array([[0.64, 0.36]])

## Recycling

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



# Model Prep
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler, Normalizer#, Binarizer
from sklearn.decomposition import PCA

# Regression Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB#, BernoulliNB, GaussianNB
#from sklearn.svm import SVC

In [None]:
X = PolynomialFeatures().fit_transform(X)

In [None]:
X = StandardScaler().fit_transform(X)
X = MinMaxScaler().fit_transform(X)
X = Normalizer().fit_transform(X)

In [None]:
X = PCA(n_components= 
        components_or_alteredfeatures_in_model).fit_transform(X)  # Is poly features wrapped up in PCA already???

In [None]:
VarianceThreshold # Remove features that have the same value in more that XX% of the column
# Used to cut out potentially high variance risk columns by identifying the ones with, for example: 
# 10% == 1, and 90% == 0

In [None]:
# Prepping (for Model Consumption)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify = y, random_state=42)

# Model Prep
from sklearn.model_selection import train_test_split, cross_val_score

# Scalers
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler, Normalizer#, Binarizer
X = StandardScaler().fit_transform(X)
X = MinMaxScaler().fit_transform(X)
X = Normalizer().fit_transform(X)

pd.DataFrame(X_scaled, columns=features).head()




# Feature Engineering (New Features)


# Bootstrapping (Not our code, copied from DSI)

def bootstrap_sample(values, statistic, num_samples): 
    bootstrap_statistics = []
    
    for _ in range(num_samples):
        subset = np.random.choice(values, size = 1000, replace =True)
        stat = statistic(subset)
        bootstrap_statistics.append(stat)
        
    return bootstrap_statistics 

#==============================================
from sklearn.preprocessing import PolynomialFeatures

# Poly Features
poly = PolynomialFeatures(include_bias=False)
features = ['col1', 'col2', 'col3']
X = df[features]
y = df['target']

X_poly = poly.fit_transform(X)
X_poly[:5, :]

poly.get_feature_names(features)

pd.DataFrame(X_poly, columns=poly.get_feature_names(features)).head()

# Copied over from the PCA lession

from sklearn.decomposition import PCA
pca = PCA()

pca = pca.fit(X_train_scaled, y_train)

pca_train = pca.transform(X_train)
pca_test = pca.transform(X_test_scaled)

#==============================================


explained_variance = pca.explained_variance_ratio_
#print(explained_variance)
cumulative_explained_variance = np.cumsum(explained_variance)
for i in cumulative_explained_variance:
    print(i)

#==============================================
X = PCA(n_components= 
        components_or_alteredfeatures_in_model).fit_transform(X)  # Is poly features wrapped up in PCA already???

VarianceThreshold # Remove features that have the same value in more that XX% of the column
# Used to cut out potentially high variance risk columns by identifying the ones with, for example: 
# 10% == 1, and 90% == 0








#====================================
# Random Bullcrap



# Computing Standard Deviation with list comprehension & no loops (don't know why it's "pop"
def pop_std(the_column):
    return round(((sum([(x - the_column.mean())**2 for x in the_column]))/len(the_column))**(1/2))
pop_std(test_df['Sat_Total'])

# Alternative remaping code
df['column'] = [float(x.replace('example','')) for x in df['column']]


# Using .items() to make a column into a dictionary
dict = {}
for key, value in df['column'].items():
    dict[key] = value

# Scales
# Scaled
tv_mean = df['TV'].mean()
((df['TV'] - tv_mean) / df['TV'].std(ddof=0)).head() #Manually



----------------------------------------------
pd.scatter_matrix(df) # Alt sns.pairplot?
df.mean().sort_values().plot(style = '.') # Good for organizing things when otherwise random.

In [None]:
# Model prep & metric imports
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler, Normalizer
#, Binarizer
from sklearn.decomposition import PCA

# Imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Regression model imports
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor


#==========================================



def test_model(df, test_model, features, target = 'Target'):
    model = test_model
    X_train, X_test, y_train, y_test = train_test_split(df[features], df[target]) 
    model = model.fit(X_train,y_train)

    print('Train:', model.score(X_train,y_train))
    print('Test: ',  model.score(X_test,y_test))
    return


# Simple Model Forms

features = ['columns_etc']
X = df[features]
y = df['target']

model = Model(**params).fit(Xtrain,ytrain)
model.score(Xtrain,ytrain)
model.score(Xtest,ytest)

#======================================


def run_model(model, X_train, X_test, y_train, y_test, results_dataframe, save = True, 
              rando_state = 76, is_neural_network = False, 
              NN_epochs = 10, NN_batch_size = 32, NN_verbose = 0):

    #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = rando_state)

model      = model.fit(X_train, y_train)
pred_train = model.predict(X_train)
pred_test  = model.predict(X_test)
dict_model = {'model' : re.findall(r'^[^@]+\(', str(model))[0].strip("("),
         'parameters' : model.get_params()}
        

    
    # Adding non-model dependant information to dict_model
    dict_model['time']               = datetime.fromtimestamp(time()).strftime('%m/%d/%Y, %H:%M:%S')
    dict_model['features']           = [feature for feature in X.columns]
    dict_model['train_RMSE']         = np.sqrt(metrics.mean_squared_error(y_train, pred_train))
    #dict_model['train_Median_error'] = metrics.median_absolute_error(y_train, pred_train)
    #dict_model['train_R_squared']    = metrics.r2_score(y_train, pred_train)
    dict_model['test_RMSE']          = np.sqrt(metrics.mean_squared_error(y_test, pred_test))
    #dict_model['test_Median_error']  = metrics.median_absolute_error(y_test, pred_test)
    #dict_model['test_R_squared']     = metrics.r2_score(y_test, pred_test)
    
    # Printing current results
    print(dict_model['model'] + ' Train')
    print('RMSE : ' + str(dict_model['train_RMSE']))
    #print('Median Abs Error : ' + str(dict_model['train_Median_error']))
    #print('R Squared        : ' + str(dict_model['train_R_squared']))
    print('\n' + dict_model['model'] + ' Test')
    print('RMSE : ' + str(dict_model['test_RMSE']))
    #print('Median Abs Error : ' + str(dict_model['test_Median_error']))
    #print('R Squared        : ' + str(dict_model['test_R_squared']))
    
    # Saving current results
    results_dataframe = results_dataframe.append(dict_model, ignore_index=True)
    if save == True:
        results_dataframe.to_csv('./data/modeling_results_' + datetime.fromtimestamp(time()).strftime('%m_%d_%Y')
                          , index = False)
    
    return results_dataframe





# Regressors
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
#, ExtraTreesRegressor
from sklearn.svm import SVR


scaled_df = pd.DataFrame(StandardScaler().fit_transform(df),
                         columns = df.columns)
scaled_df.head()

X = scaled_df.drop(columns = ['target'])
y = scaled_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2)
lin_reg    = LinearRegression().fit(X_train, y_train)
knn_reg    = KNeighborsRegressor().fit(X_train, y_train)
cart_reg   = DecisionTreeRegressor().fit(X_train, y_train)
bag_reg    = BaggingRegressor().fit(X_train, y_train)
randof_reg = RandomForestRegressor().fit(X_train, y_train)
ada_reg    = AdaBoostRegressor().fit(X_train, y_train)
SV_reg     = SVR().fit(X_train, y_train)
models     = [
    lin_reg,
    knn_reg,
    cart_reg,
    bag_reg,
    randof_reg,
    ada_reg,
    SV_reg    
]


# Classifiers
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC

# Classifications:
X = scaled_df.drop(columns = ['e401k', 'p401k'])
y = [1 if scaled_df['e401k'][i] > 0 else 0 for i in range(scaled_df.shape[0])]
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X, y,
                                                    test_size = 0.2)
LogisticRegression().fit(X_train, y_train)
KNeighborsClassifier().fit(X_train, y_train)
DecisionTreeClassifier().fit(X_train, y_train)
BaggingClassifier().fit(X_train, y_train)
RandomForestClassifier().fit(X_train, y_train)
AdaBoostClassifier().fit(X_train, y_train)
SVC().fit(X_train, y_train)




# Multi NB
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
model = nb.fit(X_train, y_train)
predictions = model.predict(X_test)

model.score(X_train, y_train)
model.score(X_test, y_test)

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)



# XGBoost

from xgboost import XGBRegressor
X = df[[]]
y = df[]
df_results = run_model(XGBRegressor(objective='reg:squarederror'), X, y, df_results)

import xgboost as xgb
from xgboost import XGBRegressor
xgb_regressor=XGBRegressor(max_depth=7, 
                           n_estimators=500, 
                           objective="reg:linear", 
                           min_child_weight = 6,
                           subsample = 0.87,
                           colsample_bytree = 0.50,
                           scale_pos_weight = 1.0,                       
                           learning_rate=0.1)

xgb_regressor.fit(X_train, y_log, eval_metric=RMSLE)

In [None]:
## Needed:
 1.) Imports
 2.) Classification Metrics
 #==================================

# For general Regression.
def RMSE(y_real, y_hat):  # Goal=0, Avg Distance
    return np.sqrt(metrics.mean_squared_error(y_real, y_hat))

#==================================
# For Neural Network
def rmse(y_true, y_pred):
    return backend.sqrt(backend.mean(backend.square(y_pred - y_true), axis=-1))

#====================================
# All Other Important Regression Metrics

def Reg_metrics(y_real, y_hat):
    print('RMSE            :',np.sqrt(metrics.mean_squared_error(y_real, y_hat)))   # Goal=0, Avg Distance
    print('Median Abs Error:',metrics.median_absolute_error(y_real,y_hat))          # Goal=0, Median Distance
    print('R Squared       :',metrics.r2_score(y_real, y_hat))          # Goal=1, Percent model can explain
    return

#====================================


def rmse_score(model, 
               X_train = X_train,
               X_test = X_test,
               y_train = y_train,
               y_test = y_test):
    train_score = mean_squared_error(y_true = y_train,
                                     y_pred = model.predict(X_train)) ** 0.5
    test_score = mean_squared_error(y_true = y_test,
                                    y_pred = model.predict(X_test)) ** 0.5
    print(str(model)[0:20])
    print("Train:" + str(train_score))
    print("Test :" + str(test_score))
    print('')
    return

#====================================
def f1_score(model, 
               X_train = Xr_train,
               X_test = Xr_test,
               y_train = yr_train,
               y_test = yr_test):
    train_score = f1_score(y_train,
                           model.predict(X_train))
    test_score  = f1_score(y_test,
                           model.predict(X_test))
    print(str(model)[0:20])
    print("Train:" + str(train_score))
    print("Test :" + str(test_score))
    print('')
    return

#====================================
def run_model(model, X_train, X_test, y_train, y_test, results_dataframe, save = True, 
              rando_state = 76, is_neural_network = False, 
              NN_epochs = 10, NN_batch_size = 32, NN_verbose = 0):

    #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = rando_state)

model      = model.fit(X_train, y_train)
pred_train = model.predict(X_train)
pred_test  = model.predict(X_test)
dict_model = {'model' : re.findall(r'^[^@]+\(', str(model))[0].strip("("),
         'parameters' : model.get_params()}
        

    
    # Adding non-model dependant information to dict_model
    dict_model['time']               = datetime.fromtimestamp(time()).strftime('%m/%d/%Y, %H:%M:%S')
    dict_model['features']           = [feature for feature in X.columns]
    dict_model['train_RMSE']         = np.sqrt(metrics.mean_squared_error(y_train, pred_train))
    dict_model['test_RMSE']          = np.sqrt(metrics.mean_squared_error(y_test, pred_test))
    
    # Printing current results
    print(dict_model['model'] + ' Train')
    print('RMSE : ' + str(dict_model['train_RMSE']))
    print('\n' + dict_model['model'] + ' Test')
    print('RMSE : ' + str(dict_model['test_RMSE']))
    
    # Saving current results
    results_dataframe = results_dataframe.append(dict_model, ignore_index=True)
    if save == True:
        results_dataframe.to_csv('./data/modeling_results_' + datetime.fromtimestamp(time()).strftime('%m_%d_%Y')
                          , index = False)
    
    return results_dataframe





#====================================

# Alternmative Measures for above Function
# Adding non-model dependant information to dict_model
    dict_model['time']               = datetime.fromtimestamp(time()).strftime('%m/%d/%Y, %H:%M:%S')
    dict_model['features']           = [feature for feature in X.columns]
    dict_model['train_RMSE']         = np.sqrt(metrics.mean_squared_error(y_train, pred_train))
    #dict_model['train_Median_error'] = metrics.median_absolute_error(y_train, pred_train)
    #dict_model['train_R_squared']    = metrics.r2_score(y_train, pred_train)
    dict_model['test_RMSE']          = np.sqrt(metrics.mean_squared_error(y_test, pred_test))
    #dict_model['test_Median_error']  = metrics.median_absolute_error(y_test, pred_test)
    #dict_model['test_R_squared']     = metrics.r2_score(y_test, pred_test)
    
    # Printing current results
    print(dict_model['model'] + ' Train')
    print('RMSE : ' + str(dict_model['train_RMSE']))
    #print('Median Abs Error : ' + str(dict_model['train_Median_error']))
    #print('R Squared        : ' + str(dict_model['train_R_squared']))
    print('\n' + dict_model['model'] + ' Test')
    print('RMSE : ' + str(dict_model['test_RMSE']))
    #print('Median Abs Error : ' + str(dict_model['test_Median_error']))
    #print('R Squared        : ' + str(dict_model['test_R_squared']))

##  Unsupervised Modeling

In [None]:
#import gensim
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN

In [None]:
silhouette_score(the_X,the_kmodel.labels_) # -1 = Bad 0 = Meh 1 = Good

In [None]:
# Modeling

# DBScan
#model = DBSCAN(eps = 3, min_samples = 3)
#test_df['y_hat'] = model.fit_predict(X)

# K Means
#k_model = KMeans(n_clusters=3)
#model = k_model.fit(df[['x_axis', 'y_axis']])

# Visualizing based on 2 columns
plt.figure(figsize = (5,5))
plt.scatter(the_df['x'],the_df['y'], c = the_df['DBS_y'], alpha=0.7);

In [None]:
centroids = pd.DataFrame(the_kmodel.cluster_centers_, columns = ['x_axis', 'y_axis'])
centroids

In [None]:
def compare_clusters(df,
                     n_clusters_kmeans  = 3,
                     n_clusters_agg     = 3,
                     eps_dbscan         = 3,
                     min_samples_dbscan = 3):
    
    fig,ax  = plt.subplots(nrows = 1, ncols = 4, figsize=(24, 7)) # Init graph backgrounds & their positions
    kmeans  = KMeans(n_clusters = n_clusters_kmeans)
    agclus  = AgglomerativeClustering(n_clusters = n_clusters_agg)
    dbscan  = DBSCAN(eps = eps_dbscan, min_samples = min_samples_dbscan)
    model_tuples = [(False,  0, 'label'),
                    (kmeans, 1, 'K_Means'), 
                    (agclus, 2, 'Agg_clust'), 
                    (dbscan, 3, 'DBScan')]

    for tup in model_tuples:
        # [0] = Model
        # [1] = Axis Number
        # [2] = label string
        if tup[0] == False:
            pass
        else:
            model = tup[0].fit(df.iloc[:,0:2])
            df[tup[2]] = model.labels_
        colors = plt.cm.Paired(np.linspace(0, 1, len(df[tup[2]].unique())))
        for label, color in zip(df[tup[2]].unique(), colors):
            X_  = df[df[tup[2]] == label]
            ax[tup[1]].scatter(X_.iloc[:, 0], X_.iloc[:, 1], s = 70,
                            color = color, label = label, alpha = .9)

        ax[tup[1]].set_title(tup[2], fontsize=30, color = 'Black')
        ax[tup[1]].legend(loc = "lower right")

In [None]:
df_results = pd.DataFrame(columns=['model', 'features', 'parameters',
    'train_RMSE', 'train_Median_error', 'train_R_squared',
    'test_RMSE',  'test_Median_error',  'test_R_squared', 'time'])

def run_model(model, X, y, results_dataframe, save = True, 
              rando_state = 76, is_neural_network = False, 
              NN_epochs = 10, NN_batch_size = 32, NN_verbose = 0):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = rando_state)

    if is_neural_network == False:
        # Performing normal running of model.
        model      = model.fit(X_train, y_train)
        pred_train = model.predict(X_train)
        pred_test  = model.predict(X_test)
        dict_model = {'model' : re.findall(r'^[^@]+\(', str(model))[0].strip("("),
                 'parameters' : model.get_params()}
        
    elif is_neural_network == True:
        # Running model for Neural Networks
        results = model.fit(X_train, y_train, 
                    epochs= NN_epochs, 
                    batch_size = NN_batch_size,
                    verbose = NN_verbose,
                    validation_data = (X_test,y_test))
        pred_train = model.predict(X_train)
        pred_test  = model.predict(X_test)
        dict_model = {'model' : "Neural Network",
                 'parameters' : {'layers' : [layer.get_config()['units'] for layer in results.model.layers],
                                 'batch_size' : NN_batch_size,
                                 'epochs'     : NN_epochs}}

    else:
        print("A non-boolian value was passed to is_neural_network.  This is an error.")
        return
    
    # Adding non-model dependant information to dict_model
    dict_model['time']               = datetime.fromtimestamp(time()).strftime('%m/%d/%Y, %H:%M:%S')
    dict_model['features']           = [feature for feature in X.columns]
    dict_model['train_RMSE']         = np.sqrt(metrics.mean_squared_error(y_train, pred_train))
    dict_model['train_Median_error'] = metrics.median_absolute_error(y_train, pred_train)
    dict_model['train_R_squared']    = metrics.r2_score(y_train, pred_train)
    dict_model['test_RMSE']          = np.sqrt(metrics.mean_squared_error(y_test, pred_test))
    dict_model['test_Median_error']  = metrics.median_absolute_error(y_test, pred_test)
    dict_model['test_R_squared']     = metrics.r2_score(y_test, pred_test)
    
    # Printing current results
    print(dict_model['model'] + ' Train')
    print('RMSE             : ' + str(dict_model['train_RMSE']))
    print('Median Abs Error : ' + str(dict_model['train_Median_error']))
    print('R Squared        : ' + str(dict_model['train_R_squared']))
    print('\n' + dict_model['model'] + ' Test')
    print('RMSE             : ' + str(dict_model['test_RMSE']))
    print('Median Abs Error : ' + str(dict_model['test_Median_error']))
    print('R Squared        : ' + str(dict_model['test_R_squared']))
    
    # Saving current results
    results_dataframe = results_dataframe.append(dict_model, ignore_index=True)
    if save == True:
        results_dataframe.to_csv('./data/modeling_results_' + datetime.fromtimestamp(time()).strftime('%m_%d_%Y')
                          , index = False)
    
    return results_dataframe

In [None]:
X = df[[features]]
y = df[target]
df_results = run_model(LogisticRegression(), X, y, df_results)

In [None]:
df_results = run_model(KNeighborsClassifier(), X, y, df_results)
df_results = run_model(DecisionTreeClassifier(), X, y, df_results)
df_results = run_model(RandomForestClassifier(), X, y, df_results)
df_results = run_model(AdaBoostClassifier(), X, y, df_results)
df_results = run_model(ExtraTreesClassifier(), X, y, df_results)
df_results = run_model(GradientBoostingClassifier(), X, y, df_results)
df_results = run_model(SVR(), X, y, df_results)

In [None]:
params = {
    'param':[list_of_options]
}
Model_gridsearch = GridSearchCV(Model(), params, cv=5, verbose=1, n_jobs=2,)


df_results = run_model(Model_gridsearch, X, y, df_results)

In [17]:
# Load the data
df_mod_test = pd.read_csv('./data/mod_test.csv')
df_mod_test_target = pd.read_csv('./data/mod_test_target.csv')

# Drop the 'Unnamed: 0' columns
df_mod_test = df_mod_test.drop(columns=['Unnamed: 0'])
df_mod_test_target = df_mod_test_target.drop(columns=['Unnamed: 0'])



In [31]:
df_runway_top    = pd.read_csv("./data/renttherunway_first.csv").drop("Unnamed: 0", axis = 1)
df_runway_top    = df_runway_top[["review_text", "fit"]].dropna().reset_index(drop = True)
df_runway_top.head()

df_runway_bottom = pd.read_csv("./data/renttherunway_last.csv").drop("Unnamed: 0", axis = 1)
df_runway_bottom = df_runway_bottom[["review_text", "fit"]].dropna().reset_index(drop = True)
df_runway_bottom.head()

df_modcloth = pd.read_json('./data/modcloth_final_data.json', lines = True)
df_modcloth = df_modcloth[["review_text", "fit"]].dropna().reset_index(drop = True)
df_modcloth.head()

Unnamed: 0,review_text,fit
0,An adorable romper! Belt and zipper were a lit...,fit
1,I rented this dress for a photo shoot. The the...,fit
2,This hugged in all the right places! It was a ...,fit
3,I rented this for my company's black tie award...,fit
4,I have always been petite in my upper body and...,fit
