# Gradient Boosting Classifier

## Import the required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import nltk
import os

## Construct paths using os.path.join

In [2]:
# Both preprocessed CSV files sit in the `artifacts` directory one level up.
true_news_path, fake_news_path = (
    os.path.join('..', 'artifacts', file_name)
    for file_name in ('preprocessed_true.csv', 'preprocessed_fake.csv')
)

## Read the CSV files

In [3]:
# Load the two preprocessed article sets into separate DataFrames.
true_news, fake_news = (
    pd.read_csv(csv_path) for csv_path in (true_news_path, fake_news_path)
)

## Add a label column to each dataframe

In [4]:
# Tag every row with its source so the class survives the concatenation below.
true_news = true_news.assign(label='true')
fake_news = fake_news.assign(label='fake')

## Combine the datasets

In [5]:
# Stack the true and fake articles into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# input keeps its own 0-based row labels, so the combined frame would contain
# duplicate index values.
data = pd.concat([true_news, fake_news], ignore_index=True)

## Display first few rows

In [6]:
# Preview the first five rows of the combined frame.
print(data.head(n=5))

                                               title  \
0  Obama paints Trump as no friend of the working...   
1  Justice Department seeks warrant to seize anci...   
2  Democratic nomination will likely be won in Ma...   
4  Peru's central bank chief says Trump gives him...   

                                                text       subject  \
0  philadelphia reuters hillary clinton sidelined...  politicsNews   
1  washington reuters u justice department said w...  politicsNews   
2  new york reuters hillary clinton ’ campaign ma...  politicsNews   
3  baghdad reuters kurdish peshmerga fighter reje...     worldnews   
4  lima reuters head peru ’ central bank said wed...  politicsNews   

                  date label  
0  September 13, 2016   true  
1    December 6, 2017   true  
2   February 10, 2016   true  
3    October 14, 2017   true  
4   September 8, 2016   true  


## Check for missing values

In [7]:
# Count missing entries per column.
print(data.isna().sum())

title        0
text       632
subject      0
date         0
label        0
dtype: int64


## Drop rows with missing 'text' or 'title' values

In [8]:
# Discard any row where either of the two text fields used for modelling is missing.
data = data.dropna(axis=0, how='any', subset=['text', 'title'])

## Combine 'title' and 'text' for vectorization

In [9]:
# Join title and body (separated by one space) into the single field that is vectorized later.
data = data.assign(combined_text=data['title'] + ' ' + data['text'])

## Encode labels

In [10]:
# Replace the string labels with integer codes.
# LabelEncoder numbers classes in sorted order, so 'fake' -> 0 and 'true' -> 1.
label_encoder = LabelEncoder()
label_encoder.fit(data['label'])
data['label'] = label_encoder.transform(data['label'])

## Split the data

In [11]:
# Hold out 30% of the rows, then divide that hold-out roughly 2:1 into test and
# validation sets, i.e. approximately 70% / 20% / 10% of the data overall.
train_data, temp_data = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
)
test_data, val_data = train_test_split(
    temp_data,
    test_size=0.33,
    random_state=42,
)

## Display the size of each split

In [12]:
# Report (rows, columns) for each split, one per line.
print(
    f'Train data size: {train_data.shape}',
    f'Test data size: {test_data.shape}',
    f'Validation data size: {val_data.shape}',
    sep='\n',
)

Train data size: (30986, 6)
Test data size: (8897, 6)
Validation data size: (4383, 6)


## Vectorize the text data

In [13]:
# Fit the TF-IDF vocabulary and weights on the training split only, then apply
# the already-fitted vectorizer to the test and validation splits.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(train_data['combined_text'])
X_test, X_val = (
    vectorizer.transform(split['combined_text'])
    for split in (test_data, val_data)
)

# Matching integer-encoded targets for each split.
y_train, y_test, y_val = (
    split['label'] for split in (train_data, test_data, val_data)
)

## Initialize and train the Gradient Boosting model

In [14]:
# 100 depth-3 trees with a 0.1 learning rate; the fixed seed keeps runs repeatable.
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
model.fit(X_train, y_train)

## Make predictions

In [15]:
# Predict class codes for all three splits with the fitted model.
y_pred_train, y_pred_test, y_pred_val = (
    model.predict(features) for features in (X_train, X_test, X_val)
)

## Evaluate the model

In [16]:
# Print the same report (heading, per-class metrics, confusion matrix) for each split.
for heading, actual, predicted in (
    ("Training Data Evaluation", y_train, y_pred_train),
    ("Test Data Evaluation", y_test, y_pred_test),
    ("Validation Data Evaluation", y_val, y_pred_val),
):
    print(heading)
    print(classification_report(actual, predicted))
    print(confusion_matrix(actual, predicted))

Training Data Evaluation
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16001
           1       1.00      1.00      1.00     14985

    accuracy                           1.00     30986
   macro avg       1.00      1.00      1.00     30986
weighted avg       1.00      1.00      1.00     30986

[[15937    64]
 [   23 14962]]
Test Data Evaluation
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      4593
           1       0.99      1.00      0.99      4304

    accuracy                           0.99      8897
   macro avg       0.99      0.99      0.99      8897
weighted avg       0.99      0.99      0.99      8897

[[4560   33]
 [  14 4290]]
Validation Data Evaluation
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      2256
           1       0.99      1.00      0.99      2127

    accuracy                           0.99      4383

## Get test-set predictions and the corresponding true labels

In [17]:
# Pair the held-out test labels with the model's predictions for the same rows.
y_true, y_pred = y_test, model.predict(X_test)

## Function to calculate metrics

In [18]:
def model_evaluation(y_true, y_pred):
    """Compute error metrics between true and predicted values.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.

    Returns:
        A ``(mae, rmse, r2)`` tuple: mean absolute error, root mean squared
        error, and the R^2 score.

    Note:
        When both inputs are 0/1 class codes, every absolute error is either
        0 or 1, so MAE equals the misclassification rate and RMSE is its
        square root.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mse),
        r2_score(y_true, y_pred),
    )

## Calculate metrics

In [19]:
# Unpack the three metrics returned for the test-set predictions.
mae, rmse, r2_square = model_evaluation(y_true=y_true, y_pred=y_pred)

## Print the results in the desired format

In [20]:
# These metrics were computed from model.predict(X_test) against y_test, so
# they describe test-set performance, not training performance.
print("Model Test Performance")
print("RMSE:", rmse)
print("MAE:", mae)
print("R2 score:", r2_square * 100)  # R^2 shown as a percentage

# Visual separator followed by blank lines.
print('*' * 35)
print('\n')

Model Training Performance
RMSE: 0.07268204424000571
MAE: 0.005282679554906148
R2 score: 97.88469624456806
***********************************


