In [None]:
import pandas as pd
import numpy as np

# Read 'p4_rich_tokens.parquet' into a DataFrame; the next cell deduplicates it.
complete_data = pd.read_parquet(path='p4_rich_tokens.parquet')

In [None]:
# Deduplicate on "rich_tokens": the first row for each distinct value is kept
# and every later repeat is discarded.
print('Removing duplicates...')
is_repeat = complete_data.duplicated(subset='rich_tokens')
complete_data = complete_data[~is_repeat]
# Persist the deduplicated frame so that later cells can reload it from disk.
complete_data.to_parquet('pre_processed_news.parquet')
print('Done: ', len(complete_data), ' rows left.')

Note: I only discovered at this late stage that a large number of articles were duplicated, which is why the deduplication step above was added. Whoops.

In [2]:
import pandas as pd
import numpy as np

# Reload the frame written to 'pre_processed_news.parquet' by the cell above.
complete_data = pd.read_parquet(path='pre_processed_news.parquet')

In [3]:
# Sanity check: print the (rows, columns) shape first, then the column index.
for summary in (complete_data.shape, complete_data.columns):
    print(summary)

(735518, 2)
Index(['type', 'cleaned_content'], dtype='object')


In [5]:
# Keep only the rows whose 'type' label is present; a row with a missing label
# cannot be used as a classification target.
print('Removing rows with NA values in the type column...')
type_present = complete_data['type'].notna()
complete_data = complete_data[type_present]
print('Done: ', len(complete_data), ' rows left.')

Removing rows with NA values in the type column...
Done:  735518  rows left.


In [6]:
# Write back only the label and text columns. This overwrites
# 'pre_processed_news.parquet', the same file this section was loaded from.
kept_columns = ['type', 'cleaned_content']
complete_data[kept_columns].to_parquet('pre_processed_news.parquet')
print('Done.')

Done.


In [2]:
import pandas as pd
import numpy as np

# Load the two-column ('type', 'cleaned_content') frame used for modelling.
complete_data = pd.read_parquet(path='pre_processed_news.parquet')

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Hold out 20% of the rows; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    complete_data['cleaned_content'],
    complete_data['type'],
    test_size=0.2,
    random_state=42,
)

# Collapse the article type into a binary target: True for "reliable" or
# "political", False for every other type. `isin` is vectorised, so it avoids
# calling a Python lambda once per row while producing the same boolean labels.
positive_types = ["reliable", "political"]
y_train = y_train.isin(positive_types)
y_test = y_test.isin(positive_types)

# Halve the hold-out set into a test part and a validation part.
# NOTE(review): X_val / y_val are only printed in the cells visible here; they
# are never vectorised or scored - confirm whether they are used further on.
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Report the size of every partition (same order and format as before).
partitions = [
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
    ('X_val', X_val),
    ('y_val', y_val),
]
for label, part in partitions:
    print(f'{label}: ', part.shape)


X_train:  (588414,)
X_test:  (73552,)
y_train:  (588414,)
y_test:  (73552,)
X_val:  (73552,)
y_val:  (73552,)


In [5]:
# English stop words are removed. A float max_df / min_df is a document
# proportion: terms found in more than 90% of the training documents, or in
# fewer than 0.5% of them, are left out of the vocabulary.
vectorizer_options = {'stop_words': 'english', 'max_df': 0.9, 'min_df': 0.005}
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)

print('Fitting the vectorizer...')
# The vocabulary and IDF weights are learned from the training text only; the
# test text is then projected onto that fitted vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
print('Done. matrix: ', X_train_tfidf.shape)


Fitting the vectorizer...
Done. matrix:  (588414, 5995)


In [6]:
# Baseline model: multinomial naive Bayes on the TF-IDF features.
# `fit` returns the estimator itself, so `clf` ends up bound to the fitted model.
print('Fitting the model...')
clf = MultinomialNB().fit(X_train_tfidf, y_train)
print('Done.')

Fitting the model...
Done.


In [7]:
# Score the naive Bayes model on the held-out test features.
predicted = clf.predict(X_test_tfidf)

accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predicted)
print('Accuracy: ', accuracy)

# NOTE(review): a per-class breakdown could be printed with
# metrics.classification_report(y_test, predicted). The call that used to be
# commented out here passed complete_data.target_names, but the DataFrame only
# has the columns 'type' and 'cleaned_content', so that attribute does not exist.


Accuracy:  0.7819773765499238


# Logistic regression

In [6]:
# Second model: logistic regression with scikit-learn's default settings,
# trained on the same TF-IDF features. This rebinds `clf`.
from sklearn.linear_model import LogisticRegression

print('Fitting the model...')
clf = LogisticRegression().fit(X_train_tfidf, y_train)
print('Done.')

Fitting the model...
Done.


In [7]:
# Score whichever model `clf` currently refers to on the test features.
predicted = clf.predict(X_test_tfidf)

accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predicted)
print('Accuracy: ', accuracy)


Accuracy:  0.8331656515118555


# Random forest

In [20]:
# Third model: a random forest of 200 depth-limited trees. This rebinds `clf`.
from sklearn.ensemble import RandomForestClassifier

forest_settings = dict(
    n_estimators=200,
    random_state=42,      # fixed seed for reproducible trees
    max_depth=15,
    max_features='sqrt',  # each split considers sqrt(n_features) candidate features
    n_jobs=-1,            # use all available CPU cores
)
clf = RandomForestClassifier(**forest_settings)
print('Fitting the model...')
clf.fit(X_train_tfidf, y_train)
print('Done.')

Fitting the model...
Done.


In [21]:
# Summarise the fitted forest: per-feature importances and the two
# hyper-parameters it was configured with.
forest_summary = [
    ('Feature importances: ', clf.feature_importances_),
    ('Number of estimators: ', clf.n_estimators),
    ('Max depth: ', clf.max_depth),
]
for caption, value in forest_summary:
    print(caption, value)

Feature importances:  [2.23950800e-06 2.89271245e-06 1.79770102e-06 ... 1.11696774e-05
 6.83631742e-06 3.35585754e-06]
Number of estimators:  200
Max depth:  15


In [22]:
# Score the 200-tree forest on the held-out test features.
predicted = clf.predict(X_test_tfidf)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predicted)

print('Accuracy: ', accuracy)

Accuracy:  0.7483956928431585


In [31]:
# Same forest configuration as `clf` in the earlier cell, but with 300 trees
# instead of 200, to check whether more estimators improve accuracy.
clf2 = RandomForestClassifier(n_estimators=300, random_state=42, max_depth=15, max_features='sqrt', n_jobs=-1)
print('Fitting the model...')
clf2.fit(X_train_tfidf, y_train)
# Completion message, matching the other model-fitting cells, so the saved
# output shows that the fit actually finished.
print('Done.')

Fitting the model...


In [32]:
# Score the 300-tree forest on the held-out test features.
predicted = clf2.predict(X_test_tfidf)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predicted)

print('Accuracy: ', accuracy)

Accuracy:  0.7479742223189036
