In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import syllables
import re
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
df = pd.read_csv('../Data/train.csv')

In [4]:
excerpts = df['excerpt']

In [5]:
def extractData(excerptColumn):
    i = 0
    excerptData = {}
    for entry in excerptColumn:
        countComplex = 0
        countSyllable = 0
        excerptData[i] = [len(entry.split()), len(re.split(r'\.!?', entry))]
        for word in entry.split():
            word = word.strip(',."\'!?;:')
            syllable = syllables.estimate(word)
            if ( syllable >= 3) and ((not word.endswith('ed')) or (not word.endswith('ing')) or (not word.endswith('es'))):
                countComplex += 1
            countSyllable += syllable
        excerptData[i].append(countSyllable)
        excerptData[i].append(countComplex)
        i += 1
    return excerptData

In [6]:
# excerptData = np.array(list(extractData(excerpts).values()))
# pd.DataFrame(excerptData).to_csv('../Data/Metrics.csv')

In [7]:
metric = pd.read_csv('../Data/Metrics.csv', header= None)
metric.columns = ["index","word_count","sentence_count", "syllable_count", "complex_word_count"]

In [8]:
metric = metric[["word_count","sentence_count", "syllable_count", "complex_word_count"]].copy()
metric = metric.sample(frac=1, random_state=2)

In [9]:
reg = LinearRegression()

In [10]:
metricArray = np.array(metric[1:])
targetValues = df['target'].values
reg.fit(metricArray,targetValues)
targetValues.min(), targetValues.max()

(-3.676267773, 1.711389827)

In [11]:
reg.coef_, reg.intercept_

(array([-0.00599576,  0.00297437,  0.00509838, -0.01320502]),
 -1.001517127358303)

In [12]:
predictions = reg.predict(metricArray)

In [13]:
mean_squared_error(targetValues,predictions, squared = False)

1.032307382052584

In [14]:
r2_score(targetValues, predictions)

0.002107372287326381

In [15]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(metricArray, targetValues)

DecisionTreeRegressor()

In [16]:
tree_predictions = tree_reg.predict(metricArray)

In [17]:
mean_squared_error(targetValues, tree_predictions, squared = False)

0.04591632636841679

In [18]:
r2_score(targetValues, tree_predictions)

0.9980257596839203

In [19]:
scores = cross_val_score(tree_reg, metricArray,targetValues, scoring="neg_mean_absolute_error", cv=10)
scores = - scores

In [20]:
scores, scores.mean(), scores.std()

(array([1.08034993, 1.24241771, 1.27379178, 1.34754905, 1.27904418,
        1.16577416, 1.41838702, 1.20862345, 1.09430452, 1.23553538]),
 1.2345777172159837,
 0.09949678811054201)

In [21]:
lin_scores = cross_val_score(reg, metricArray, targetValues, scoring="neg_mean_absolute_error", cv=10)

In [22]:
lin_scores = - lin_scores

In [23]:
lin_scores, lin_scores.mean(), lin_scores.std()

(array([0.61339215, 0.80647329, 0.84347656, 0.95213119, 0.85823995,
        0.95237545, 1.11346274, 0.90809287, 0.59665067, 0.91214851]),
 0.8556443370428788,
 0.148127292958199)

In [24]:
sdg_reg = SGDRegressor()
scaler = StandardScaler()
scaledMetric = scaler.fit_transform(metricArray)
sdg_reg.fit(scaledMetric, targetValues)

SGDRegressor()

In [25]:
sdg_scores = cross_val_score(sdg_reg, scaledMetric, targetValues, scoring="neg_mean_absolute_error", cv=10)

In [26]:
sdg_scores = - sdg_scores

In [27]:
sdg_scores, sdg_scores.mean(), sdg_scores.std()

(array([0.61531359, 0.80785917, 0.85157942, 0.95547094, 0.85730315,
        0.94403142, 1.11465186, 0.90719024, 0.59475775, 0.90984753]),
 0.8558005083868263,
 0.14784472329802575)

In [28]:
print(f"Linear Regression RMSE:{lin_scores.mean()}")
print(f"Linear Regression STD:{lin_scores.std()}")
print(f"SDG Regression RMSE:{sdg_scores.mean()}")
print(f"SDG Regression STD:{sdg_scores.std()}")
print(f"Decision Tree Regression RMSE:{scores.mean()}")
print(f"Decision Tree Regression STD:{scores.std()}")

Linear Regression RMSE:0.8556443370428788
Linear Regression STD:0.148127292958199
SDG Regression RMSE:0.8558005083868263
SDG Regression STD:0.14784472329802575
Decision Tree Regression RMSE:1.2345777172159837
Decision Tree Regression STD:0.09949678811054201


In [29]:
#random forests + gradient boosting + pipelining
#correlations between features - pearson correlation
#principle component analysis
#neural nets
#part of speech tagging (nltk, stanford)

In [30]:
gb_reg = GradientBoostingRegressor()
gb_reg.fit(scaledMetric, targetValues)
gb_scores = cross_val_score(gb_reg, scaledMetric, targetValues, scoring="neg_mean_absolute_error", cv=10)

In [31]:
gb_scores = - gb_scores

In [32]:
gb_scores, gb_scores.mean(), gb_scores.std()

(array([0.62275243, 0.80850988, 0.87037101, 0.97095408, 0.85866971,
        0.9683326 , 1.12582434, 0.91536938, 0.59582462, 0.91688109]),
 0.8653489156559095,
 0.15151239640100375)

In [33]:
rf_reg = RandomForestRegressor()
rf_reg.fit(scaledMetric, targetValues)
rf_scores = cross_val_score(rf_reg, scaledMetric, targetValues, scoring="neg_mean_absolute_error", cv=10)

In [34]:
rf_scores = - rf_scores

In [35]:
rf_scores, rf_scores.mean(), rf_scores.std()

(array([0.67851565, 0.82054508, 0.90893048, 1.02515126, 0.87525756,
        0.99171818, 1.12781536, 0.94855361, 0.63870512, 0.97130265]),
 0.8986494939821776,
 0.14412964305452344)