In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import syllables
import re
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
df = pd.read_csv('../Data/train.csv')

In [4]:
excerpts = df['excerpt']

In [5]:
def extractData(excerptColumn):
    i = 0
    excerptData = {}
    for entry in excerptColumn:
        countComplex = 0
        countSyllable = 0
        excerptData[i] = [len(entry.split()), len(re.split(r'\.!?', entry))]
        for word in entry.split():
            word = word.strip(',."\'!?;:')
            syllable = syllables.estimate(word)
            if ( syllable >= 3) and ((not word.endswith('ed')) or (not word.endswith('ing')) or (not word.endswith('es'))):
                countComplex += 1
            countSyllable += syllable
        excerptData[i].append(countSyllable)
        excerptData[i].append(countComplex)
        i += 1
    return excerptData

In [6]:
# excerptData = np.array(list(extractData(excerpts).values()))
# pd.DataFrame(excerptData).to_csv('../Data/Metrics.csv')

In [7]:
metric = pd.read_csv('../Data/Metrics.csv', header= None)
metric.columns = ["index","word_count","sentence_count", "syllable_count", "complex_word_count"]

In [8]:
metric = metric[["word_count","sentence_count", "syllable_count", "complex_word_count"]].copy()

In [9]:
reg = LinearRegression()

In [10]:
metricArray = np.array(metric[1:])
targetValues = np.array(df['target'])
reg.fit(metricArray,targetValues)
targetValues.min(), targetValues.max()

(-3.676267773, 1.711389827)

In [11]:
reg.coef_, reg.intercept_

(array([-0.00375627,  0.06266718, -0.00243323, -0.03427505]),
 0.4570793308587172)

In [12]:
predictions = reg.predict(metricArray)

In [13]:
mean_squared_error(targetValues,predictions, squared = False)

0.8501610290279323

In [14]:
r2_score(targetValues, predictions)

0.32318786849128345

In [15]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(metricArray, targetValues)

DecisionTreeRegressor()

In [16]:
tree_predictions = tree_reg.predict(metricArray)

In [17]:
mean_squared_error(targetValues, tree_predictions, squared = False)

0.018539661240752757

In [18]:
r2_score(targetValues, tree_predictions)

0.9996781382732329

In [19]:
scores = cross_val_score(tree_reg, metricArray,targetValues, scoring="neg_mean_absolute_error", cv=10)
scores = - scores

In [20]:
scores, scores.mean(), scores.std()

(array([0.93300849, 1.01459611, 1.10450916, 0.9906704 , 0.91591577,
        0.89858007, 0.95331807, 0.89107984, 0.87937305, 0.96427949]),
 0.9545330442185677,
 0.06501035060144639)

In [21]:
lin_scores = cross_val_score(reg, metricArray, targetValues, scoring="neg_mean_absolute_error", cv=10)

In [22]:
lin_scores = - lin_scores

In [23]:
lin_scores, lin_scores.mean(), lin_scores.std()

(array([0.55086064, 0.76144612, 0.86098811, 0.76264975, 0.6856531 ,
        0.71731131, 0.75487651, 0.68710959, 0.51695606, 0.66356134]),
 0.6961412531563924,
 0.09707080249355003)

In [24]:
sdg_reg = SGDRegressor()
scaler = StandardScaler()
scaledMetric = scaler.fit_transform(metricArray)
sdg_reg.fit(scaledMetric, targetValues)

SGDRegressor()

In [25]:
sdg_scores = cross_val_score(sdg_reg, scaledMetric, targetValues, scoring="neg_mean_absolute_error", cv=10)

In [26]:
sdg_scores = - sdg_scores

In [27]:
sdg_scores, sdg_scores.mean(), sdg_scores.std()

(array([0.55180846, 0.76581613, 0.86632526, 0.76270447, 0.67498821,
        0.72849804, 0.75144077, 0.68760046, 0.51935923, 0.66321347]),
 0.6971754499320303,
 0.09799521595385519)

In [28]:
print(f"Linear Regression RMSE:{lin_scores.mean()}")
print(f"Linear Regression STD:{lin_scores.std()}")
print(f"SDG Regression RMSE:{sdg_scores.mean()}")
print(f"SDG Regression STD:{sdg_scores.std()}")
print(f"Decision Tree Regression RMSE:{scores.mean()}")
print(f"Decision Tree Regression STD:{scores.std()}")

Linear Regression RMSE:0.6961412531563924
Linear Regression STD:0.09707080249355003
SDG Regression RMSE:0.6971754499320303
SDG Regression STD:0.09799521595385519
Decision Tree Regression RMSE:0.9545330442185677
Decision Tree Regression STD:0.06501035060144639


In [29]:
#random forests + gradient boosting + pipelining
#correlations between features - pearson correlation
#principle component analysis
#neural nets
#part of speech tagging (nltk, stanford)

In [30]:
gb_reg = GradientBoostingRegressor()
gb_reg.fit(scaledMetric, targetValues)
gb_scores = cross_val_score(gb_reg, scaledMetric, targetValues, scoring="neg_mean_absolute_error", cv=10)

In [31]:
gb_scores = - gb_scores

In [32]:
gb_scores, gb_scores.mean(), gb_scores.std()

(array([0.57290069, 0.7278632 , 0.87937069, 0.77854077, 0.67124824,
        0.69043641, 0.72419452, 0.66293279, 0.53363464, 0.67387293]),
 0.6914994896879213,
 0.09262183336136393)

In [33]:
rf_reg = RandomForestRegressor()
rf_reg.fit(scaledMetric, targetValues)
rf_scores = cross_val_score(rf_reg, scaledMetric, targetValues, scoring="neg_mean_absolute_error", cv=10)

In [34]:
rf_scores = - rf_scores

In [35]:
rf_scores, rf_scores.mean(), rf_scores.std()

(array([0.60456004, 0.74071198, 0.90821539, 0.805936  , 0.70760406,
        0.68346489, 0.7510678 , 0.6718062 , 0.57736598, 0.71368573]),
 0.7164418082924289,
 0.09033241514201343)