## Text Preprocessing

In [47]:
# Pakages
import spacy
import pandas as pd
import re

In [48]:
# Spacy Object
processing = spacy.load('en_core_web_md')

In [49]:
# Load the data
df = pd.read_csv('news_classifier_dataset.csv')
print(df.shape)
df.head()

(1003, 2)


Unnamed: 0,Header,Section
0,Companies — profitable or not — make 2024 the ...,Automobile
1,What the U.S. can learn from Norway when it co...,Automobile
2,UAW threatens to strike Ford’s Kentucky Truck ...,Automobile
3,2 takeaways from Ford CEO Jim Farley that boos...,Automobile
4,"Ford CEO says forget Tesla, ‘Pro’ unit is auto...",Automobile


In [50]:
# Text Preprocessing using Spacy
corpus = []

for i in df['Header']:
    # Replacing numbers and Special Characters with whitespace
    news = re.sub('[^a-zA-Z\s]', '', i)

    # Convert the String to lowercase
    news = news.lower()

    # Removing Stop word and Lemmatization
    doc = processing(news)
    
    news = [token.lemma_ for token in doc if not token.is_stop]
    news = ' '.join(news)

    corpus.append({'Header':news})

  news = re.sub('[^a-zA-Z\s]', '', i)


In [51]:
len(corpus)

1003

In [52]:
processed_header = pd.DataFrame(data = corpus, columns = ['Header'])

section_df = df['Section']

processed_df = pd.concat([processed_header, section_df], axis = 1)
processed_df

Unnamed: 0,Header,Section
0,company profitable year cost cut,Automobile
1,learn norway come ev adoption,Automobile
2,uaw threaten strike ford kentucky truck plant ...,Automobile
3,takeaway ford ceo jim farley boost confidenc...,Automobile
4,ford ceo say forget tesla pro unit auto indust...,Automobile
...,...,...
998,founder wise skype raise million build tech ...,Technology
999,indias zee entertainment dive sony call mega...,Technology
1000,tencent riot game division cut staff create ...,Technology
1001,youtube star mrbeast make x video call bit f...,Technology


In [53]:
# Text preprocessing function
def processing(text):

    processing = spacy.load('en_core_web_md')
    
    corpus = []

    # Replacing numbers and Special Characters with whitespace
    news = re.sub('[^a-zA-Z\s]', '', text)

    # Convert the String to lowercase
    news = news.lower()

    # Removing Stop word and Lemmatisation
    doc = processing(news)
    
    news = [token.lemma_ for token in doc if not token.is_stop]
    news = ' '.join(news)

    corpus.append({'Header':news})

    processed_header = pd.DataFrame(data = corpus, columns = ['Header'])

    return processed_header

  news = re.sub('[^a-zA-Z\s]', '', text)


## Model Building

In [54]:
# Train Test Split
from sklearn.model_selection import train_test_split

# Vectorisation module
from sklearn.feature_extraction.text import TfidfVectorizer

# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Evaluation metrics
from sklearn.metrics import accuracy_score

In [55]:
df = processed_df
df.head()

Unnamed: 0,Header,Section
0,company profitable year cost cut,Automobile
1,learn norway come ev adoption,Automobile
2,uaw threaten strike ford kentucky truck plant ...,Automobile
3,takeaway ford ceo jim farley boost confidenc...,Automobile
4,ford ceo say forget tesla pro unit auto indust...,Automobile


In [56]:
df['Section'].value_counts()

Section
Health_and_Science    201
Investing             201
Technology            201
Automobile            200
Politics              200
Name: count, dtype: int64

In [57]:
df['Target'] = df['Section'].map({'Technology':0, 'Automobile':1,'Health_and_Science':2, 'Investing':3, 'Politics':4})

In [58]:
df['Target'].head()

0    1
1    1
2    1
3    1
4    1
Name: Target, dtype: int64

In [59]:
# Train Test Split
x = df['Header']
y = df['Target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)
print(x_train.shape, x_test.shape)

(802,) (201,)


In [60]:
# Vectorization of text
tfidf = TfidfVectorizer()

x_train_vectorized = tfidf.fit_transform(x_train)
x_test_vectorized  = tfidf.transform(x_test)

In [61]:
# Defining the model
def model_training(model_name, x_train, y_train):
    if model_name == 'RandomForestClassifier':
        model = RandomForestClassifier(random_state = 42)

    elif model_name == 'XGBClassifier':
        model = XGBClassifier(random_state = 42)

    elif model_name == 'SVC':
        model = SVC(random_state = 42)
 
    elif model_name == 'KNeighborsClassifier':
        model = KNeighborsClassifier()        

    elif model_name == 'LogisticRegression':
        model = LogisticRegression(random_state = 42) 

    elif model_name == 'DecisionTreeClassifier':
        model = DecisionTreeClassifier(random_state = 42) 
    
    model.fit(x_train, y_train)

    return model

In [62]:
# Evaluation

def evaluation_metrics(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)

    return f'{round(accuracy*100)}%'

#### Random Forest Classifier

In [63]:
rfc = model_training('RandomForestClassifier', x_train_vectorized, y_train)

y_train_pred_1 = rfc.predict(x_train_vectorized)
y_test_pred_1 = rfc.predict(x_test_vectorized)

train_score_1 = evaluation_metrics(y_train, y_train_pred_1)
print('The Accuracy score of training set :',train_score_1)

test_score_1 = evaluation_metrics(y_test, y_test_pred_1)
print('The Accuracy score of testing set :',test_score_1)

The Accuracy score of training set : 98%
The Accuracy score of testing set : 75%


#### XGB Classifier

In [64]:
xgbc = model_training('XGBClassifier', x_train_vectorized, y_train)

y_train_pred_2 = xgbc.predict(x_train_vectorized)
y_test_pred_2 = xgbc.predict(x_test_vectorized)

train_score_2 = evaluation_metrics(y_train, y_train_pred_2)
print('The Accuracy score of training set :',train_score_2)

test_score_2 = evaluation_metrics(y_test, y_test_pred_2)
print('The Accuracy score of testing set :',test_score_2)

The Accuracy score of training set : 97%
The Accuracy score of testing set : 71%


#### Support Vector Machine

In [70]:
svc = model_training('SVC', x_train_vectorized, y_train)

y_train_pred_3 = svc.predict(x_train_vectorized)
y_test_pred_3 = svc.predict(x_test_vectorized)

train_score_3 = evaluation_metrics(y_train, y_train_pred_3)
print('The Accuracy score of training set :',train_score_3)

test_score_3 = evaluation_metrics(y_test, y_test_pred_3)
print('The Accuracy score of testing set :',test_score_3)

The Accuracy score of training set : 98%
The Accuracy score of testing set : 78%


#### KNeighbors Classifier

In [66]:
knc = model_training('KNeighborsClassifier', x_train_vectorized, y_train)

y_train_pred_4 = knc.predict(x_train_vectorized)
y_test_pred_4 = knc.predict(x_test_vectorized)

train_score_4 = evaluation_metrics(y_train, y_train_pred_4)
print('The Accuracy score of training set :',train_score_4)

test_score_4 = evaluation_metrics(y_test, y_test_pred_4)
print('The Accuracy score of testing set :',test_score_4)

The Accuracy score of training set : 81%
The Accuracy score of testing set : 74%


#### Logistic Regression

In [69]:
lr = model_training('LogisticRegression', x_train_vectorized, y_train)

y_train_pred_5 = lr.predict(x_train_vectorized)
y_test_pred_5 = lr.predict(x_test_vectorized)

train_score_5 = evaluation_metrics(y_train, y_train_pred_5)
print('The Accuracy score of training set :',train_score_5)

test_score_5 = evaluation_metrics(y_test, y_test_pred_5)
print('The Accuracy score of testing set :',test_score_5)

The Accuracy score of training set : 95%
The Accuracy score of testing set : 78%


#### Decision Tree Classifier

In [68]:
dtc = model_training('DecisionTreeClassifier', x_train_vectorized, y_train)

y_train_pred_6 = dtc.predict(x_train_vectorized)
y_test_pred_6 = dtc.predict(x_test_vectorized)

train_score_6 = evaluation_metrics(y_train, y_train_pred_6)
print('The Accuracy score of training set :',train_score_6)

test_score_6 = evaluation_metrics(y_test, y_test_pred_6)
print('The Accuracy score of testing set :',test_score_6)

The Accuracy score of training set : 98%
The Accuracy score of testing set : 66%


##### LogisticRegression Performs better with a Accuracy score of 95% in training set and a Accuracy score of 78% in testing set

#### Hyper Parameter Tuning

In [71]:
from sklearn.model_selection import GridSearchCV

In [72]:
# Define hyperparameters grid for Logistic Regression
param_grid = {
    'penalty': ['l1', 'l2'],  # Regularization type
    'C': [0.001, 0.01, 0.1, 1, 10, 100]  # Inverse of regularization strength
}

In [73]:
# Perform Grid Search Cross Validation
grid_search = GridSearchCV(lr, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train_vectorized, y_train)

# Get best hyperparameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best CV Score:", best_score)

30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(sel

Best Parameters: {'C': 1, 'penalty': 'l2'}
Best CV Score: 0.7655201863354038


In [74]:
# Passing the parameters and check the accuracy for both training and testing
best_logistic_regression = LogisticRegression(**best_params)

best_logistic_regression.fit(x_train_vectorized, y_train)

# Predicting on training and testing sets
train_predictions = best_logistic_regression.predict(x_train_vectorized)
test_predictions = best_logistic_regression.predict(x_test_vectorized)

# Calculate accuracy on training and testing sets
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

Training Accuracy: 0.9538653366583542
Testing Accuracy: 0.7761194029850746


There is no improvement in accuracy so we use actual Logistic Regression

#### Saving the model

In [75]:
import pickle

# Model Object
pickle.dump(lr, open('model.pkl', 'wb'))

# Vectorizer
pickle.dump(tfidf, open('vectorizer.pkl', 'wb'))

#### User input prediction

In [77]:
text = 'Hedge funds took profits in these popular technology stocks during the fourth quarter'
# text = 'Russia’s war in Ukraine is igniting an old debate in Brussels over debt'

# Text processing

processed_df = processing(text)

model = pickle.load(open('model.pkl', 'rb'))
vectorizer = pickle.load(open('vectorizer.pkl', 'rb'))

input_vectorized = vectorizer.transform(processed_df['Header'])

input_prediction = model.predict(input_vectorized)

if input_prediction == 0:
    input_prediction = print('Technology')

elif input_prediction == 1:
    input_prediction = print('Automobile')

elif input_prediction == 2:
    input_prediction = print('Health and Science')
    
elif input_prediction == 3:
    input_prediction = print('Investing')
    
elif input_prediction == 4:
    input_prediction = print('Politics')

Investing
