First we import our dependencies

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

# train-test split
from sklearn.model_selection import train_test_split

# loss functions for today
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# stuff for evaluating classifiers
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt # for displaying a pretty confusion matrix


# dummy models for comparison
from sklearn.dummy import DummyRegressor
from sklearn.dummy import DummyClassifier

# regression models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

# classification models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


Count vectorization involves turning a collection of text documents into a matrix of token counts.

In other words, count vectorization means using every observed token (word) across the entire corpus as an attribute (column) and, for each document (row), tallying up how many times each token is observed.

We first need to import the data and inspect it.

In [2]:
df1 = pd.read_csv('./hot_posts.csv')
df2 = pd.read_csv('./top_posts.csv')
df3 = pd.read_csv('./controversial_posts.csv')
df4 = pd.read_csv('./new_posts.csv')
dfMerge = [df1, df2, df3, df4]
df = pd.concat(dfMerge)
df

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created
0,Subscribe to r/RussiaUKraineWar2022 on Telegra...,473,v0gm37,RussiaUkraineWar2022,https://t.me/UkraineWarPosts,1,,1.653847e+09
1,"10,000 servicemen of the second wave from trai...",14371,y26xch,RussiaUkraineWar2022,https://i.redd.it/5e0wl0p78et91.jpg,562,,1.665589e+09
2,r/RussiaUkraineWar2022 Predictions Tournament,18646,ueslps,RussiaUkraineWar2022,https://reddit.com/r/RussiaUkraineWar2022/pred...,1,,1.665666e+09
3,"I'm safe, fifteen hours with a shovel in my ha...",1519,y39tem,RussiaUkraineWar2022,https://i.redd.it/m4us0z3gzmt91.jpg,160,"If you have a desire to help me, write to me.",1.665695e+09
4,this is my shell. there are many like it. but ...,339,y3hhep,RussiaUkraineWar2022,https://i.redd.it/t9c4v2iqmot91.jpg,25,,1.665714e+09
...,...,...,...,...,...,...,...,...
976,Lieutenant Sergei Didorenko & Senior Lieutenan...,228,xvp4c8,RussiaUkraineWar2022,https://www.reddit.com/gallery/xvp4c8,20,,1.664914e+09
977,Ukrainian forces blew up a Russian ammo cache ...,153,xvp2i0,RussiaUkraineWar2022,https://v.redd.it/3xd2ekcthur91,7,,1.664914e+09
978,Ka-52 pilot Captain Aleksey Belonozhko has bee...,304,xvp1gi,RussiaUkraineWar2022,https://i.redd.it/t6sxzwxlhur91.jpg,17,,1.664914e+09
979,Current frontlines according to Michael McKay,148,xvp0x9,RussiaUkraineWar2022,https://www.reddit.com/gallery/xvp0x9,17,,1.664914e+09


As we can see, there are Titles and Scores. We want to perdict if certain words (x) give a high score (Y)

Now we Vectorise this data so that the model can read it as 1's and 0's (i.e., a list of all the tokens/words present) and inspect term frequencies.)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
vect.fit(df["title"])

We then print the vocabulary for the vect we created and check for the amount of words.

In [5]:
vocab_size = len(vect.vocabulary_)

print("Number of vocab before post prossesing: {:.3f}".format(vocab_size))  # 3

Number of vocab before post prossesing: 7059.000


We start by removing all low-frequency words. (code adapted from https://stackoverflow.com/questions/57179045/how-to-remove-less-frequent-words-from-pandas-dataframe)

In [6]:
input_text = df["title"]

# Set the threshold for the minimum number of occurrences, in this case 3
all_ = [x for y in input_text for x in y.split(' ') ]
a, b = np.unique(all_, return_counts = True)
to_remove = a[b < 3]

# Remove the low-frequency words from the "title" column
df["title"] = [' '.join(np.array(y.split(' '))[~np.isin(y.split(' '), to_remove)])
                for y in input_text]

We need to clean this list so that our model only has meaningful words to work with.

One way of doing this is by <strong style="color:red">removing stopwords</strong>.

To do this we can use the ```nltk``` library, which provides a list of stop words for various languages.

In [7]:
import nltk

# Download the list of stop words
nltk.download('stopwords')
nltk.download('punkt')

# Get the list of stop words for the English language
stop_words = nltk.corpus.stopwords.words('english')

# Tokenize the text in the 'text' column
df['title'] = df['title'].apply(nltk.word_tokenize)

# Define a function to remove stop words from a list of tokens
def remove_stop_words(tokens):
    return [token for token in tokens if token.lower() not in stop_words]

# Apply the function to the 'text' column of the dataframe
df['title'] = df['title'].apply(remove_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Seb_R\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


We check to see if the code has removed the stop words

In [8]:
print(df['title'])

0                    [Telegram, unseen, footage, ., 18+]
1      [10,000, servicemen, second, wave, training, U...
2      [r/RussiaUkraineWar2022, Predictions, Tournament]
3      ['m, safe, ,, fifteen, hours, shovel, hands, l...
4                                   [many, like, ., one]
                             ...                        
976    [Lieutenant, Sergei, &, Senior, Lieutenant, Vl...
977    [Ukrainian, forces, blew, Russian, ammo, back,...
978    [Ka-52, pilot, Captain, Aleksey, killed, Ukrai...
979            [Current, frontlines, according, Michael]
980    [Russian, Commander, tank, company, ,, neutral...
Name: title, Length: 3792, dtype: object


As we can see, code has removed the stopwords from our dataframe.

We can now focus on removing low-frequency words to help simplify our dataframe for our model and rerun the count vectorizer to continue building our model.

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
vect.fit(df['title'])

AttributeError: 'list' object has no attribute 'lower'

This returns an error because an array of strings is what the CountVectorizer expects. Therefore, it will crash if we pass in a nested array of tokens. We therefore need to transform the dataframe.

In [10]:
df["title"] = df["title"].map(' '.join)
vect = CountVectorizer()
vect.fit(df["title"])

In [11]:
vocab_size = len(vect.vocabulary_)

print("Number of vocab before stop-word after post processing: {:.3f}".format(vocab_size))  # 3

Number of vocab before stop-word after post processing: 2177.000


This means that we have removed a lot of words to make our model simplere and hopefully more precise.

We transform this data to an array

In [12]:
Title_text = df["title"]

vector = vect.transform(Title_text)
print(vector.toarray())

[[0 0 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [13]:
vector.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
array = vector.toarray()[0]
inv_voc = {v: k for k, v in vect.vocabulary_.items()}
print([inv_voc[x] for x in np.where(array > 1)[0]])

[]


We now need to create the test data and split it into text_test and text_train. We are only looking at the titles.

In [15]:
Title_text[0]

0                    Telegram unseen footage . 18+
0    r/RussiaUkraineWar2022 Predictions Tournament
0                           video Mariupol today .
0       Zelensky offers Russian soldiers surrender
Name: title, dtype: object

In [16]:
X = vector.toarray()
y = df['score']

In [17]:
sum(X[0])

4

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11122)

We then check to see if the X_train and X_test data contains the same amount of columns but with different amounts of rows.

In [19]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [22]:
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

Both X_test and X_train have the same amount of columns - 2,177

In [21]:
y_train

2      7763
104     223
641     981
894     870
25       90
       ... 
323    1423
336     213
849      66
130     107
92        0
Name: score, Length: 2844, dtype: int64

In [None]:
ranreg = RandomForestRegressor()
ranreg.fit(X_train,y_train)

print("Training set score: {:.3f}".format(ranreg.score(X_train, y_train)))
print("Test score: {:.5f}".format(ranreg.score(X_test, y_test)))

In [None]:
linreg = LinearRegression()
linreg.fit(X_train,y_train)

print("Training set score: {:.3f}".format(linreg.score(X_train, y_train)))
print("Test score: {:.5f}".format(linreg.score(X_test, y_test)))

In [None]:
kreg = KNeighborsRegressor()
kreg.fit(X_train,y_train)

print("Training set score: {:.3f}".format(kreg.score(X_train, y_train)))
print("Test score: {:.5f}".format(kreg.score(X_test, y_test)))

By looking at the 3 results above, we see that the <strong style="color:red">Random Forrest Regressor</strong> yields the most accurate results but that KNeighbor Regression had the largest increase in accuracy from the preprocessing. We will try to make both more accurate.

However, both are still very inaccurate and over-fits on the training set. We therefore have to do some more data treatment with our model to get a better more accurate result.

This can be done in several ways:
1. Increasing the number of trees in the forest (provided that the model is not already overfitting)
2. Tuning the hyperparameters of the individual trees, such as the maximum depth of the trees and the minimum number of samples required to split a node
3. Using a better quality training dataset that has more relevant features and less noise
4. Using cross-validation to evaluate the model and select the best performing set of hyperparameters
5. Ensembling multiple random forest models with different parameters to improve the overall accuracy of the model.

We know that the model is overfitting already, so increasing the number of trees in the forest, will most likely not improve the accuracy.

Therefore, we will start by tweaking the hyperparameters to get a more accurate result.
To tune the hyperparameters of our regression model, we will use the RandomizedSearchCV class from the scikit-learn library. This class allows us to define a range of hyperparameters to search over, and it will automatically evaluate a random sampling of combinations of these hyperparameters to find the best performing set of parameters.

In [None]:
# Import the necessary modules
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter space to search over
param_distributions = {
    'max_depth': randint(1, 10),
    'min_samples_split': randint(2, 10),
    'n_estimators': randint(10, 100),
}

# Define the random forest regressor
regressor = RandomForestRegressor(random_state=42)

# Define the search method
search = RandomizedSearchCV(regressor, param_distributions, cv=5, random_state=42)

# Perform the search
search.fit(X, y)

# Print the best set of hyperparameters
print('Best hyperparameters:', search.best_params_)

Now that we know what hyperparameters that best fit our data, we'll use this to try and improve the results of our model.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Create an instance of the RandomForestRegressor class
model = RandomForestRegressor()

# Set the hyperparameters of the model
model.n_estimators = 50  # Number of trees in the forest
model.max_depth = 30  # Maximum depth of the tree
model.min_samples_split = 10  # Minimum number of samples required to split a node
model.min_samples_leaf = 1

model.fit(X_train,y_train)
print("Training set score: {:.3f}".format(model.score(X_train, y_train)))
print("Test score: {:.5f}".format(model.score(X_test, y_test)))

We'll also search on the best parameters on KNeightborsRegression since it had the largest improvement on the before and after the data preprocessing.

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Create a KNN Regressor model
model = KNeighborsRegressor()

# Define the hyperparameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Create a grid search object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(grid_search.best_params_)

In [None]:
# Create an instance of the RandomForestRegressor class
model = KNeighborsRegressor()

# Set the hyperparameters of the model
model.n_neighbors = 9  # Number of trees in the forest
model.p = 2  # Maximum depth of the tree
model.weights = 'distance'  # Minimum number of samples required to split a node

model.fit(X_train,y_train)
print("Training set score: {:.3f}".format(model.score(X_train, y_train)))
print("Test score: {:.5f}".format(model.score(X_test, y_test)))