In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import nltk
nltk.download("punkt_tab")
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/bhoitechut69/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/bhoitechut69/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bhoitechut69/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/bhoitechut69/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/bhoitechut69/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### **Data_Cleaning_etc**

In [2]:
true_df = pd.read_csv("archive/True.csv")
fake_df = pd.read_csv("archive/Fake.csv")

In [3]:
true_df = true_df[['text']]
true_df['y'] = 1

fake_df = fake_df[['text']]
fake_df['y'] = 0

In [4]:
data = pd.concat([true_df,fake_df]).reset_index(drop="True")

In [5]:
data.head()

Unnamed: 0,text,y
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


In [6]:
print(data.isnull().sum())

text    0
y       0
dtype: int64


In [7]:
def clean(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub("  "," ",text)
    test = text.strip()
    return text

  text = re.sub('\[.*?\]', '', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('\w*\d\w*', '', text)


In [8]:
X = data['text'].apply(clean)
y = data['y']

In [9]:
X.head()

0    washington reuters the head of a conservative ...
1    washington reuters transgender people will be ...
2    washington reuters the special counsel investi...
3    washington reuters trump campaign adviser geor...
4    seattlewashington reuters president donald tru...
Name: text, dtype: object

In [10]:
stop = set(stopwords.words("english"))
lemm = WordNetLemmatizer()
def token_lemm(text):
    token = word_tokenize(text)
    token = [i for i in token if i not in stop]
    token = [lemm.lemmatize(i) for i in token]
    return ' '.join(token)

In [11]:
X = X.apply(token_lemm)

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X,y,train_size=0.6, random_state=69)
X_val, X_test, y_val, y_test = train_test_split(X_val,y_val,train_size=0.5, random_state=69)

In [13]:
vectorizer = TfidfVectorizer()

X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)
X_test_vectorized = vectorizer.transform(X_test)

In [21]:
from Decision_Tree import DecisionTreeClassifier_my
my_dt = DecisionTreeClassifier_my(min_sample_split=5, max_depth=4)
y_arr = np.array(y_train)
my_dt.root = my_dt.build_tree(X_train_vectorized, y_arr)

acc = my_dt.accuracy(X_train_vectorized, y_arr)
print(f"Accuracy on training data: {acc*100:.2f}%")

ValueError: zero-dimensional arrays cannot be concatenated

### **DecisionTree**

In [262]:
param_grid_tree = {
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [10, 20, 50],
    'min_samples_leaf': [5, 10, 20],
    'max_features': ['sqrt', 'log2']
}
my_tree = DecisionTreeClassifier(random_state=69)

grid_search_tree = GridSearchCV(estimator=my_tree, param_grid=param_grid_tree, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

In [263]:
grid_search_tree.fit(X_train_vectorized, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [264]:
print("Best parameters found:", grid_search_tree.best_params_)
best_model = grid_search_tree.best_estimator_

Best parameters found: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 50}


In [265]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = best_model.predict(X_val_vectorized)
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy: 0.6766146993318486
              precision    recall  f1-score   support

           0       0.64      0.87      0.74      4679
           1       0.76      0.47      0.58      4301

    accuracy                           0.68      8980
   macro avg       0.70      0.67      0.66      8980
weighted avg       0.70      0.68      0.66      8980



In [266]:
y_test_pred = best_model.predict(X_test_vectorized)
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6765033407572383
              precision    recall  f1-score   support

           0       0.52      0.70      0.60      4688
           1       0.47      0.29      0.36      4292

    accuracy                           0.51      8980
   macro avg       0.50      0.50      0.48      8980
weighted avg       0.50      0.51      0.49      8980



### **RandomForest**

In [267]:
my_forest = RandomForestClassifier(random_state=69, class_weight='balanced')
param_grid = {
    'n_estimators': [200, 300],  
    'max_depth': [10, 20, 30],       
    'min_samples_leaf': [4, 5] 
}
grid_search = GridSearchCV(estimator=my_forest, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1)

In [268]:
grid_search.fit(X_train_vectorized, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [269]:
print("Best parameters found:", grid_search.best_params_)
best_model = grid_search.best_estimator_

Best parameters found: {'max_depth': 30, 'min_samples_leaf': 4, 'n_estimators': 300}


In [270]:
y_pred = best_model.predict(X_val_vectorized)
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy: 0.9809576837416482
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      4679
           1       0.98      0.98      0.98      4301

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980



In [271]:
y_test_pred = best_model.predict(X_test_vectorized)
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9802895322939866
              precision    recall  f1-score   support

           0       0.53      0.52      0.52      4688
           1       0.48      0.49      0.49      4292

    accuracy                           0.50      8980
   macro avg       0.50      0.50      0.50      8980
weighted avg       0.51      0.50      0.51      8980



### **NaiveBayes**

In [272]:
Naive_Bayes = MultinomialNB()
param_grid_nb = {'alpha':[1e-3,1e-2,0.1]}
grid_search = GridSearchCV(estimator=Naive_Bayes, param_grid=param_grid_nb, cv=5, scoring='f1', n_jobs=-1, verbose=1)

In [273]:
grid_search.fit(X_train_vectorized, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [274]:
print("Best parameters found:", grid_search.best_params_)
best_model = grid_search.best_estimator_

Best parameters found: {'alpha': 0.001}


In [275]:
y_pred = best_model.predict(X_val_vectorized)
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy: 0.9555679287305122
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      4679
           1       0.97      0.94      0.95      4301

    accuracy                           0.96      8980
   macro avg       0.96      0.95      0.96      8980
weighted avg       0.96      0.96      0.96      8980



In [276]:
y_test_pred = best_model.predict(X_test_vectorized)
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9551224944320713
              precision    recall  f1-score   support

           0       0.53      0.54      0.53      4688
           1       0.48      0.47      0.48      4292

    accuracy                           0.51      8980
   macro avg       0.50      0.50      0.50      8980
weighted avg       0.51      0.51      0.51      8980



### **SVM**

In [277]:
model_svm = LinearSVC()
param_grid_svm = {'C': [0.1,1,10]}
grid_search = GridSearchCV(estimator=model_svm, param_grid=param_grid_svm, cv=5, scoring='f1', n_jobs=-1, verbose=1)

In [278]:
grid_search.fit(X_train_vectorized, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits




In [279]:
print("Best parameters found:", grid_search.best_params_)
best_model = grid_search.best_estimator_

Best parameters found: {'C': 10}


In [280]:
y_pred = best_model.predict(X_val_vectorized)
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy: 0.9930957683741648
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4679
           1       0.99      0.99      0.99      4301

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [281]:
y_test_pred = best_model.predict(X_test_vectorized)
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9927616926503341
              precision    recall  f1-score   support

           0       0.53      0.52      0.53      4688
           1       0.48      0.48      0.48      4292

    accuracy                           0.51      8980
   macro avg       0.50      0.50      0.50      8980
weighted avg       0.51      0.51      0.51      8980



In [282]:
print("hi")

hi
