In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import syllables
import re
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
df = pd.read_csv('../Data/train.csv')

In [4]:
excerpts = df['excerpt']

In [5]:
def extractData(excerptColumn):
    i = 0
    excerptData = {}
    for entry in excerptColumn:
        countComplex = 0
        countSyllable = 0
        excerptData[i] = [len(entry.split()), len(re.split(r'\.!?', entry))]
        for word in entry.split():
            word = word.strip(',."\'!?;:')
            syllable = syllables.estimate(word)
            if ( syllable >= 3) and ((not word.endswith('ed')) or (not word.endswith('ing')) or (not word.endswith('es'))):
                countComplex += 1
            countSyllable += syllable
        excerptData[i].append(countSyllable)
        excerptData[i].append(countComplex)
        i += 1
    return excerptData

In [6]:
# excerptData = np.array(list(extractData(excerpts).values()))
# pd.DataFrame(excerptData).to_csv('../Data/Metrics.csv')

In [7]:
metric = pd.read_csv('../Data/Metrics.csv', header= None)
metric.columns = ["index","word_count","sentence_count", "syllable_count", "complex_word_count"]

In [8]:
metric = metric[["word_count","sentence_count", "syllable_count", "complex_word_count"]].copy()

In [9]:
reg = LinearRegression()

In [10]:
metricArray = np.array(metric[1:])
targetValues = np.array(df['target'])
reg.fit(metricArray,targetValues)
targetValues.min(), targetValues.max()

(-3.676267773, 1.711389827)

In [11]:
reg.coef_, reg.intercept_

(array([-0.00375627,  0.06266718, -0.00243323, -0.03427505]),
 0.4570793308587172)

In [12]:
predictions = reg.predict(metricArray)

In [13]:
mean_squared_error(targetValues,predictions, squared = False)

0.8501610290279323

In [14]:
r2_score(targetValues, predictions)

0.32318786849128345

In [15]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(metricArray, targetValues)

DecisionTreeRegressor()

In [16]:
tree_predictions = tree_reg.predict(metricArray)

In [17]:
mean_squared_error(targetValues, tree_predictions, squared = False)

0.018539661240752757

In [18]:
r2_score(targetValues, tree_predictions)

0.9996781382732329

In [19]:
scores = cross_val_score(tree_reg, metricArray,targetValues, scoring="neg_root_mean_squared_error", cv=10)
scores = - scores

In [20]:
scores, scores.mean(), scores.std()

(array([1.1635825 , 1.25878281, 1.37862664, 1.2850316 , 1.14180619,
        1.13377444, 1.22090256, 1.11875528, 1.10626926, 1.23724664]),
 1.2044777917681109,
 0.08288650516072944)

In [21]:
lin_scores = cross_val_score(reg, metricArray, targetValues, scoring="neg_root_mean_squared_error", cv=10)

In [22]:
lin_scores = - lin_scores

In [23]:
lin_scores, lin_scores.mean(), lin_scores.std()

(array([0.69669993, 0.9277535 , 1.03775653, 0.9610163 , 0.85388834,
        0.92397938, 0.92664447, 0.83670665, 0.64456593, 0.85329651]),
 0.8662307542861238,
 0.11324418663768956)

In [24]:
sdg_reg = SGDRegressor()
scaler = StandardScaler()
scaledMetric = scaler.fit_transform(metricArray)
sdg_reg.fit(scaledMetric, targetValues)

SGDRegressor()

In [25]:
sdg_scores = cross_val_score(sdg_reg, scaledMetric, targetValues, scoring="neg_root_mean_squared_error", cv=10)

In [26]:
sdg_scores = - sdg_scores

In [27]:
sdg_scores, sdg_scores.mean(), sdg_scores.std()

(array([0.6953959 , 0.93321534, 1.03514224, 0.95471666, 0.86121239,
        0.92928737, 0.92940017, 0.82934937, 0.64422523, 0.85479528]),
 0.8666739943270017,
 0.11348987679227017)

In [28]:
print(f"Linear Regression RMSE:{lin_scores.mean()}")
print(f"Linear Regression STD:{lin_scores.std()}")
print(f"SDG Regression RMSE:{sdg_scores.mean()}")
print(f"SDG Regression STD:{sdg_scores.std()}")
print(f"Decision Tree Regression RMSE:{scores.mean()}")
print(f"Decision Tree Regression STD:{scores.std()}")

Linear Regression RMSE:0.8662307542861238
Linear Regression STD:0.11324418663768956
SDG Regression RMSE:0.8666739943270017
SDG Regression STD:0.11348987679227017
Decision Tree Regression RMSE:1.2044777917681109
Decision Tree Regression STD:0.08288650516072944


In [None]:
#random forests + gradient boosting + pipelining
#correlations between features - pearson correlation
#principle component analysis
#neural nets
#part of speech tagging (nltk, stanford)