In [1]:
import pandas as pd
# Load the Hacker News stories and give the four columns readable names.
# NOTE(review): read_csv treats the first line of the file as a header by
# default; if sel_hn_stories.csv has no header row, that first story is
# consumed as column labels and silently lost -- confirm against the file.
raw_submissions = pd.read_csv("sel_hn_stories.csv")
raw_submissions.columns = ["submission_time", "upvotes", "url", "headline"]

# Rows with a missing value in any column are discarded.
submissions = raw_submissions.dropna()
print(submissions.head())

        submission_time  upvotes                  url  \
0  2010-02-17T16:57:59Z        1  blog.jonasbandi.net   
1  2014-02-04T02:36:30Z        1        blogs.wsj.com   
2  2011-10-26T07:11:29Z        1       threatpost.com   
3  2011-04-03T15:43:44Z       67     algorithm.com.au   
4  2013-01-13T16:49:20Z        1      winmacsofts.com   

                                            headline  
0  Software: Sadly we did adopt from the construc...  
1   Google’s Stock Split Means More Control for L...  
2  SSL DOS attack tool released exploiting negoti...  
3       Immutability and Blocks Lambdas and Closures  
4         Comment optimiser la vitesse de Wordpress?  


In [2]:
# One list of tokens per headline, in the same row order as `submissions`.
# Splitting on a single space (rather than any whitespace) means consecutive
# spaces produce empty-string tokens.
tokenized_headlines = [
    headline.split(" ") for headline in submissions["headline"]
]

In [3]:
punctuation = [",", ":", ";", ".", "'", '"', "’", "?", "/", "-", "+", "&", "(", ")"]


def normalize_token(raw_token):
    """Return `raw_token` lowercased with every `punctuation` mark removed."""
    cleaned = raw_token.lower()
    for mark in punctuation:
        cleaned = cleaned.replace(mark, "")
    return cleaned


# Same nesting as `tokenized_headlines`: one list of cleaned tokens per headline.
# A token made up solely of punctuation becomes an empty string and is kept.
clean_tokenized = [
    [normalize_token(raw_token) for raw_token in headline_tokens]
    for headline_tokens in tokenized_headlines
]

In [4]:
import numpy as np

# Build the vocabulary: every token that occurs at least twice across all
# headlines ends up in `unique_tokens` (ordered by when its second occurrence
# is seen); `single_tokens` records every distinct token in first-seen order.
#
# The lists keep the ordering, while the shadow sets provide constant-time
# membership tests. Testing membership against the lists directly makes this
# loop quadratic in the vocabulary size.
unique_tokens = []
single_tokens = []
seen_at_least_once = set()
seen_at_least_twice = set()

for item in clean_tokenized:
    for word in item:
        if word in seen_at_least_once:
            if word not in seen_at_least_twice:
                seen_at_least_twice.add(word)
                unique_tokens.append(word)
        else:
            seen_at_least_once.add(word)
            single_tokens.append(word)

# Bag-of-words matrix, initialised to zero: one row per headline (labelled
# 0..n-1, matching positions in `clean_tokenized`), one column per token.
counts = pd.DataFrame(0, index=np.arange(len(clean_tokenized)),
                      columns=unique_tokens)

In [5]:
# Fill the bag-of-words matrix: counts.at[row, token] ends up as the number of
# times `token` occurs in headline `row`.
#
# A single `.at[row, col]` access is used instead of `counts.iloc[row][token]`:
# the chained form assigns into a temporary row object, which pandas may return
# as a copy (and always does under Copy-on-Write), leaving `counts` unchanged.
# Row labels are 0..n-1, so the enumerate position is also the row label.
#
# NOTE: re-running this cell without rebuilding `counts` adds the occurrences
# a second time.
vocabulary = set(unique_tokens)  # constant-time membership instead of a list scan
for index, item in enumerate(clean_tokenized):
    for token in item:
        if token in vocabulary:
            counts.at[index, token] += 1
    

In [6]:
# Total occurrences of each token across all headlines (column-wise sum).
word_counts = counts.sum(axis=0)

# Keep only tokens whose total lies in [5, 100], both bounds inclusive.
counts = counts.loc[:, word_counts.between(5, 100)]

In [37]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the headlines for evaluation. The features are the token
# counts and the target is the upvote total; the fixed seed keeps the split
# reproducible.
upvote_target = submissions["upvotes"]
X_train, X_test, y_train, y_test = train_test_split(
    counts,
    upvote_target,
    test_size=0.20,
    random_state=1,
)

In [38]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training rows (fit() returns the
# estimator itself), then predict upvotes for the held-out rows.
clf = LinearRegression().fit(X_train, y_train)
predictions = clf.predict(X_test)

In [39]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the linear model on the held-out rows
# (units: upvotes squared).
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(mse)

2652.608251252287


The mean number of upvotes is 10, and the standard deviation is 39.5.
If we take the square root of our MSE to calculate error in terms of upvotes, we get 46.7. 
This means that our average error is 46.7 upvotes away from the true value. 
This is higher than the standard deviation, so our predictions are often far off-base.

In [42]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the same features: 50 trees, each limited to depth 10.
# random_state is fixed (to the same seed used for the train/test split) so the
# printed error is reproducible; without it every run grows a different forest.
regr = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=1)
regr.fit(X_train, y_train)

# Error on the held-out rows: MSE first, then its square root (RMSE), which is
# expressed in upvotes.
pred = regr.predict(X_test)
mse2 = mean_squared_error(y_test, pred)
print(mse2)
print(mse2 ** 0.5)

1952.3016541903314
44.18485774776616


In [41]:
# Training-set error of a random forest with the same settings, for comparison
# against the held-out error (a much lower training error indicates overfitting).
# random_state is fixed so the printed value is reproducible.
regr2 = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=1)
regr2.fit(X_train, y_train)

# Predict with the model fitted in this cell. Calling `regr.predict` here would
# score whichever forest an earlier cell left in the kernel and ignore `regr2`.
pred2 = regr2.predict(X_train)
mse3 = mean_squared_error(y_train, pred2)
print(mse3)

849.7888925753692
