In [2]:
from processor import processing
import pandas as pd

# Train Test Split
from sklearn.model_selection import train_test_split

# Vectorisation module
from sklearn.feature_extraction.text import CountVectorizer

# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Evaluation metrics
from sklearn.metrics import accuracy_score

In [48]:
# Load processed_dataset.csv into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path — the cell
# will not run on another machine without editing it.
df = pd.read_csv(r'F:\GUVI_DATA_SCIENCE\Project\Automated_NewsClassifier_NLP\Datasets\processed_dataset.csv')
# Show a randomly sampled row as a quick look at the columns; no seed is
# passed, so the row shown changes between runs.
df.sample()

Unnamed: 0,Header,Section
755,mark cuban say plan run president heel dalla...,Politics


In [16]:
# Count the rows per Section value to see how evenly the classes are represented.
df['Section'].value_counts()

Section
Technology            201
Automobile            200
Health_and_Science    200
Investing             200
Politics              200
Name: count, dtype: int64

In [17]:
# Encode every section name as an integer class label: the label is the
# section's position in the list below (Technology -> 0 ... Politics -> 4).
df['Target'] = df['Section'].map(
    {
        section: code
        for code, section in enumerate(
            ['Technology', 'Automobile', 'Health_and_Science', 'Investing', 'Politics']
        )
    }
)

##### **Model Training**

In [18]:
# Features come from the Header column, labels from the numeric Target column.
x, y = df['Header'], df['Target']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 42,
)
print(x_train.shape, x_test.shape)

(800,) (201,)


##### **Vectorisation of Text**

In [19]:
# CountVectorizer with default settings turns each header into a vector.
cv = CountVectorizer()

# Fit on the training split only, then reuse the fitted vectorizer for the
# test split so nothing is learned from the held-out headers.
x_train_vectorised_1 = cv.fit_transform(x_train)
x_test_vectorised_1  = cv.transform(x_test)

##### **Fitting the model**

In [20]:
def model_training(model_name, x_train, y_train):
    """Build the classifier selected by ``model_name`` and fit it.

    Parameters
    ----------
    model_name : str
        One of 'RandomForestClassifier', 'XGBClassifier', 'SVC' or
        'KNeighborsClassifier'.
    x_train
        Training features, handed unchanged to the estimator's ``fit``.
    y_train
        Training targets, handed unchanged to the estimator's ``fit``.

    Returns
    -------
    The estimator instance after ``fit`` has been called on it.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    # Factories are lazy so only the requested estimator is ever constructed.
    # Every estimator that accepts a seed here is pinned to 42.
    factories = (
        ('RandomForestClassifier', lambda: RandomForestClassifier(random_state = 42)),
        ('XGBClassifier', lambda: XGBClassifier(random_state = 42)),
        ('SVC', lambda: SVC(random_state = 42)),
        ('KNeighborsClassifier', lambda: KNeighborsClassifier()),
    )

    for known_name, build in factories:
        if model_name == known_name:
            estimator = build()
            break
    else:
        # No supported name matched.
        raise ValueError('Invalid Model name')

    estimator.fit(x_train, y_train)
    return estimator

##### **Random Forest Classifier**

In [21]:
# Fit the random forest on the vectorised training headers, then predict on
# both splits so training and testing accuracy can be compared later.
rfc = model_training(
    model_name = 'RandomForestClassifier',
    x_train = x_train_vectorised_1,
    y_train = y_train,
)
y_train_pred_1, y_test_pred_1 = (
    rfc.predict(features)
    for features in (x_train_vectorised_1, x_test_vectorised_1)
)

##### **XGBClassifier**

In [22]:
# Fit the XGBoost classifier on the vectorised training headers, then predict
# on both splits so training and testing accuracy can be compared later.
xgbc = model_training(
    model_name = 'XGBClassifier',
    x_train = x_train_vectorised_1,
    y_train = y_train,
)
y_train_pred_2, y_test_pred_2 = (
    xgbc.predict(features)
    for features in (x_train_vectorised_1, x_test_vectorised_1)
)

##### **SVC**

In [23]:
# Fit the support vector classifier on the vectorised training headers, then
# predict on both splits so training and testing accuracy can be compared later.
svc = model_training(
    model_name = 'SVC',
    x_train = x_train_vectorised_1,
    y_train = y_train,
)
y_train_pred_3, y_test_pred_3 = (
    svc.predict(features)
    for features in (x_train_vectorised_1, x_test_vectorised_1)
)

##### **KNeighborsClassifier**

In [24]:
# Fit the k-nearest-neighbours classifier on the vectorised training headers,
# then predict on both splits so training and testing accuracy can be compared later.
knc = model_training(
    model_name = 'KNeighborsClassifier',
    x_train = x_train_vectorised_1,
    y_train = y_train,
)
y_train_pred_4, y_test_pred_4 = (
    knc.predict(features)
    for features in (x_train_vectorised_1, x_test_vectorised_1)
)

##### **Evaluation**

In [25]:
def evaluation_metrics(y_test, y_pred):
    """Return the accuracy of ``y_pred`` against ``y_test`` as a percentage string.

    Parameters
    ----------
    y_test
        True labels (despite the name, callers also pass training labels).
    y_pred
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    str
        The accuracy scaled to 0-100, rounded with the built-in ``round`` to a
        whole number and suffixed with ``%`` (for example ``'72%'``).
    """
    whole_percent = round(accuracy_score(y_test, y_pred) * 100)
    return '{}%'.format(whole_percent)

##### **Random Forest Classifier**

In [26]:
# Score the random forest on both splits first, then report the two figures;
# the gap between them shows how well the model generalises.
train_score_1 = evaluation_metrics(y_train, y_train_pred_1)
test_score_1 = evaluation_metrics(y_test, y_test_pred_1)

print('The Accuracy score of training set :', train_score_1)
print('The Accuracy score of testing set :', test_score_1)

The Accuracy score of training set : 99%
The Accuracy score of testing set : 72%


##### **XGBClassifier**

In [27]:
# Score the XGBoost classifier on both splits first, then report the two
# figures; the gap between them shows how well the model generalises.
train_score_2 = evaluation_metrics(y_train, y_train_pred_2)
test_score_2 = evaluation_metrics(y_test, y_test_pred_2)

print('The Accuracy score of training set :', train_score_2)
print('The Accuracy score of testing set :', test_score_2)

The Accuracy score of training set : 96%
The Accuracy score of testing set : 69%


##### **SVC**

In [28]:
# Score the support vector classifier on both splits first, then report the
# two figures; the gap between them shows how well the model generalises.
train_score_3 = evaluation_metrics(y_train, y_train_pred_3)
test_score_3 = evaluation_metrics(y_test, y_test_pred_3)

print('The Accuracy score of training set :', train_score_3)
print('The Accuracy score of testing set :', test_score_3)

The Accuracy score of training set : 98%
The Accuracy score of testing set : 71%


##### **KNeighborsClassifier**

In [29]:
# Score the k-nearest-neighbours classifier on both splits first, then report
# the two figures; the gap between them shows how well the model generalises.
train_score_4 = evaluation_metrics(y_train, y_train_pred_4)
test_score_4 = evaluation_metrics(y_test, y_test_pred_4)

print('The Accuracy score of training set :', train_score_4)
print('The Accuracy score of testing set :', test_score_4)

The Accuracy score of training set : 65%
The Accuracy score of testing set : 45%


##### *Conclusion - RandomForestClassifier performs best in this scenario, with an accuracy score of 99% on the training set and 72% on the testing set*

##### **Saving the model**

In [3]:
import pickle

In [33]:
# Persist the fitted random forest and the fitted vectorizer so the
# prediction step can reload them. Each file is opened in a `with` block so
# the handle is flushed and closed deterministically, even if pickling raises.
# NOTE(review): both targets are absolute, machine-specific Windows paths.

# Model Object
with open(r'F:\GUVI_DATA_SCIENCE\Project\Automated_NewsClassifier_NLP\Artifacts\model.pkl', 'wb') as model_file:
    pickle.dump(rfc, model_file)

# Vectorizer
with open(r'F:\GUVI_DATA_SCIENCE\Project\Automated_NewsClassifier_NLP\Artifacts\vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

##### **User Input Prediction**

In [13]:
text = 'Microsoft’s upcoming Surface lineup will feature a next-gen NPU: Report'

# Text processing
# `processing` comes from the project's `processor` module; the result is
# indexed with ['Header'] below.
processed_df = processing(text)

# Reload the saved artifacts. The `with` blocks close each file handle as
# soon as unpickling finishes instead of leaving it to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code — only load .pkl files
# that this project produced itself.
with open(r'F:\GUVI_DATA_SCIENCE\Project\Automated_NewsClassifier_NLP\Artifacts\model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open(r'F:\GUVI_DATA_SCIENCE\Project\Automated_NewsClassifier_NLP\Artifacts\vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# Use transform only (never fit) so the input is encoded by the saved vectorizer as-is.
input_vectorized = vectorizer.transform(processed_df['Header'])

input_prediction = model.predict(input_vectorized)

# Prints the predicted numeric label; presumably it follows the Target
# encoding used for training (0 = Technology ... 4 = Politics) — confirm the
# pickled model was trained with that mapping.
print(input_prediction[0])

0
