First we import our dependencies

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

# train-test split
from sklearn.model_selection import train_test_split

# loss functions for today
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# stuff for evaluating classifiers
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt # for displaying a pretty confusion matrix


# dummy models for comparison
from sklearn.dummy import DummyRegressor
from sklearn.dummy import DummyClassifier

# regression models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

# classification models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


Count vectorization involves turning a collection of text documents into a matrix of token counts.

In other words, count vectorization means using every observed token (word) across the entire corpus as an attribute (column) and, for each document (row), tallying up how many times each token is observed.

We first need to import the data and inspect it.

In [2]:
df1 = pd.read_csv('./hot_posts.csv')
df2 = pd.read_csv('./top_posts.csv')
df3 = pd.read_csv('./controversial_posts.csv')
df4 = pd.read_csv('./new_posts.csv')
dfMerge = [df1, df2, df3, df4]
df = pd.concat(dfMerge)
df

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created
0,Subscribe to r/RussiaUKraineWar2022 on Telegra...,473,v0gm37,RussiaUkraineWar2022,https://t.me/UkraineWarPosts,1,,1.653847e+09
1,"10,000 servicemen of the second wave from trai...",14371,y26xch,RussiaUkraineWar2022,https://i.redd.it/5e0wl0p78et91.jpg,562,,1.665589e+09
2,r/RussiaUkraineWar2022 Predictions Tournament,18646,ueslps,RussiaUkraineWar2022,https://reddit.com/r/RussiaUkraineWar2022/pred...,1,,1.665666e+09
3,"I'm safe, fifteen hours with a shovel in my ha...",1519,y39tem,RussiaUkraineWar2022,https://i.redd.it/m4us0z3gzmt91.jpg,160,"If you have a desire to help me, write to me.",1.665695e+09
4,this is my shell. there are many like it. but ...,339,y3hhep,RussiaUkraineWar2022,https://i.redd.it/t9c4v2iqmot91.jpg,25,,1.665714e+09
...,...,...,...,...,...,...,...,...
976,Lieutenant Sergei Didorenko & Senior Lieutenan...,228,xvp4c8,RussiaUkraineWar2022,https://www.reddit.com/gallery/xvp4c8,20,,1.664914e+09
977,Ukrainian forces blew up a Russian ammo cache ...,153,xvp2i0,RussiaUkraineWar2022,https://v.redd.it/3xd2ekcthur91,7,,1.664914e+09
978,Ka-52 pilot Captain Aleksey Belonozhko has bee...,304,xvp1gi,RussiaUkraineWar2022,https://i.redd.it/t6sxzwxlhur91.jpg,17,,1.664914e+09
979,Current frontlines according to Michael McKay,148,xvp0x9,RussiaUkraineWar2022,https://www.reddit.com/gallery/xvp0x9,17,,1.664914e+09


As we can see, there are Titles and Scores. We want to perdict if certain words (x) give a high score (Y)

We first need to define our Title_text

In [3]:
Title_text = df['title']
Title_text

0      Subscribe to r/RussiaUKraineWar2022 on Telegra...
1      10,000 servicemen of the second wave from trai...
2          r/RussiaUkraineWar2022 Predictions Tournament
3      I'm safe, fifteen hours with a shovel in my ha...
4      this is my shell. there are many like it. but ...
                             ...                        
976    Lieutenant Sergei Didorenko & Senior Lieutenan...
977    Ukrainian forces blew up a Russian ammo cache ...
978    Ka-52 pilot Captain Aleksey Belonozhko has bee...
979        Current frontlines according to Michael McKay
980    Russian Commander of a tank company, Negmonov ...
Name: title, Length: 3792, dtype: object

Now we Vectorise this data so that the model can read it as 1's and 0's (i.e., a list of all the tokens/words present) and inspect term frequencies.)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
vect.fit(Title_text)

We then print the vocabulary for the vect we created

In [5]:
vect.vocabulary_

{'subscribe': 5995,
 'to': 6295,
 'russiaukrainewar2022': 5349,
 'on': 4349,
 'telegram': 6176,
 'for': 2544,
 'the': 6222,
 'fastest': 2388,
 'updates': 6554,
 'and': 542,
 'more': 4063,
 'unseen': 6543,
 'footage': 2542,
 '18': 65,
 '10': 12,
 '000': 1,
 'servicemen': 5545,
 'of': 4308,
 'second': 5477,
 'wave': 6783,
 'from': 2598,
 'training': 6371,
 'in': 3099,
 'uk': 6482,
 'are': 627,
 'returning': 5214,
 'ukraine': 6485,
 'predictions': 4776,
 'tournament': 6341,
 'safe': 5371,
 'fifteen': 2438,
 'hours': 2997,
 'with': 6861,
 'shovel': 5620,
 'my': 4130,
 'hands': 2853,
 'have': 2878,
 'own': 4445,
 'little': 3629,
 'house': 2998,
 'this': 6246,
 'is': 3257,
 'shell': 5579,
 'there': 6230,
 'many': 3806,
 'like': 3604,
 'it': 3269,
 'but': 1159,
 'one': 4351,
 'mine': 3973,
 'aleksey': 487,
 'martynov': 3836,
 'mobilized': 4027,
 'moscow': 4068,
 'government': 2743,
 'official': 4321,
 'returned': 5213,
 'he': 2881,
 'was': 6771,
 'september': 5528,
 '23': 115,
 'october': 430

We transform this to an array

In [6]:
vector = vect.transform(Title_text)
print(vector.toarray())

[[0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [7]:
vector.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
array = vector.toarray()[0]
inv_voc = {v: k for k, v in vect.vocabulary_.items()}
print([inv_voc[x] for x in np.where(array > 1)[0]])


[]


We now need to create the test data and split it into text_test and text_train. We are only looking at the titles.

In [9]:
Title_text[0]

0    Subscribe to r/RussiaUKraineWar2022 on Telegra...
0        r/RussiaUkraineWar2022 Predictions Tournament
0          Russell Bently's video from Mariupol today.
0    Zelensky offers guarantees for Russian soldier...
Name: title, dtype: object

In [10]:
X = vector.toarray()
y = df['score']

In [11]:
sum(X[0])

14

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11122)

In [13]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [14]:
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [15]:
y_train

2      7763
104     223
641     981
894     870
25       90
       ... 
323    1423
336     213
849      66
130     107
92        0
Name: score, Length: 2844, dtype: int64

In [16]:
ranreg = RandomForestRegressor()
ranreg.fit(X_train,y_train)

print("Training set score: {:.3f}".format(ranreg.score(X_train, y_train)))
print("Test score: {:.5f}".format(ranreg.score(X_test, y_test)))

Training set score: 0.934
Test score: 0.39567


In [17]:
linreg = LinearRegression()
linreg.fit(X_train,y_train)

print("Training set score: {:.3f}".format(linreg.score(X_train, y_train)))
print("Test score: {:.5f}".format(linreg.score(X_test, y_test)))

Training set score: 0.996
Test score: -4658690083048234418176.00000


In [18]:
kreg = KNeighborsRegressor()
kreg.fit(X_train,y_train)

print("Training set score: {:.3f}".format(kreg.score(X_train, y_train)))
print("Test score: {:.5f}".format(kreg.score(X_test, y_test)))

Training set score: 0.469
Test score: -0.13552
