In [None]:
import numpy as np
import pandas as pd
# Location and encoding of the input CSVs, kept in one place so the notebook
# can be pointed at another directory without editing every read_csv call.
# '/content' is the original hard-coded location.
DATA_DIR = '/content'
CSV_ENCODING = 'ISO-8859-1'

# Labelled training stories (columns shown below: STORY, SECTION).
df = pd.read_csv(f'{DATA_DIR}/Data_Train.csv', encoding=CSV_ENCODING)
# Unlabelled stories to be classified at the end of the notebook.
df_test = pd.read_csv(f'{DATA_DIR}/Data_Test.csv', encoding=CSV_ENCODING)

df.head()

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
1,How formidable is the opposition alliance amon...,0
2,Most Asian currencies were trading lower today...,3
3,"If you want to answer any question, click on ...",1
4,"In global markets, gold prices edged up today ...",3


In [None]:
# Show the column labels of the training frame.
df.columns

Index(['STORY', 'SECTION'], dtype='object')

In [None]:
# Distinct label codes present in the training data, in order of first appearance.
df.loc[:, 'SECTION'].unique()

array([3, 0, 1, 2])

In [None]:
# Print a banner and the first five stories for every label code, so the
# meaning of each numeric SECTION value can be judged from its content.
for section in df['SECTION'].unique():
    banner = f"\n📘  SECTION = {section}\n"
    print(banner)
    in_section = df['SECTION'] == section
    display(df.loc[in_section].head(5))


📘  SECTION = 3



Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
2,Most Asian currencies were trading lower today...,3
4,"In global markets, gold prices edged up today ...",3
6,Mumbai: India Inc's external commercial borrow...,3
7,"On Wednesday, Federal Reserve Chairman Jerome ...",3



📘  SECTION = 0



Unnamed: 0,STORY,SECTION
1,How formidable is the opposition alliance amon...,0
24,This story has been published from a wire agen...,0
26,The statements in which Yeddyurappa says that ...,0
27,"As NDA seeks re-election, agriculture will for...",0
28,Yeddyurappa said the IAF air strikes would ben...,0



📘  SECTION = 1



Unnamed: 0,STORY,SECTION
3,"If you want to answer any question, click on ...",1
5,BEIJING: Chinese tech giant Huawei has announc...,1
10,One would think that their development and te...,1
12,"Xiaomi, however, sees the presence of Jio in r...",1
13,"The ad reads ""No bells & whistles. No Bezel. N...",1



📘  SECTION = 2



Unnamed: 0,STORY,SECTION
8,What more can you give to the audience? I have...,2
9,"com, Arbaaz Khan spoke about getting back to D...",2
18,"He chooses to hide his CP from colleagues, mov...",2
21,"Starring Varun Dhawan, Alia Bhatt, Sonakshi Si...",2
29,"With two releases this year so far, Milan Tal...",2


In [None]:
# Human-readable names for the numeric SECTION codes.
# Code 2 is "Entertainment": its sample stories are about films and actors,
# and this matches the mapping used for the test-set predictions later on.
section_map = {
    0: "Politics",
    1: "Technology",
    2: "Entertainment",
    3: "Business",
}
# Add the readable label next to the numeric code.
df['CATEGORY'] = df['SECTION'].map(section_map)


In [None]:
# Preview the frame now that the CATEGORY column has been added.
df.head()

Unnamed: 0,STORY,SECTION,CATEGORY
0,But the most painful was the huge reversal in ...,3,Business
1,How formidable is the opposition alliance amon...,0,Politics
2,Most Asian currencies were trading lower today...,3,Business
3,"If you want to answer any question, click on ...",1,Techology
4,"In global markets, gold prices edged up today ...",3,Business


In [None]:
import re
import nltk
#from nltk.corups import stopwords
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used by this notebook: the stop-word list, WordNet
# (for the lemmatizer) and the punkt_tab tokenizer data (presumably what
# nltk.word_tokenize needs; confirm against the installed NLTK version).
for nltk_resource in ('stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

# Shared preprocessing state: one lemmatizer instance and the English stop
# words held in a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
def preprocess_text(text):
    """Normalize a raw story into a space-joined string of lemmatized tokens.

    Steps: lower-case, delete every character that is not a-z or whitespace,
    tokenize with NLTK, drop English stop words, lemmatize the remaining tokens.

    Args:
        text: Raw story text. Missing values (None or float NaN) produce an
            empty string; any other non-string value is converted with str().

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    # Empty CSV cells are loaded by pandas as float NaN (and None can appear
    # too); neither has .lower(), so treat missing values as an empty document
    # instead of raising AttributeError in the middle of .apply().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Characters outside a-z/whitespace are deleted, not replaced by a space,
    # so digits and punctuation vanish and words joined only by punctuation fuse.
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean every training story element-wise and keep the result next to the raw text.
df['CLEAN_STORY'] = df['STORY'].map(preprocess_text)

# Compare raw and cleaned text for the first few rows.
print(df.loc[:, ['STORY', 'CLEAN_STORY']].head())

                                               STORY  \
0  But the most painful was the huge reversal in ...   
1  How formidable is the opposition alliance amon...   
2  Most Asian currencies were trading lower today...   
3  If you want to answer any question, click on ...   
4  In global markets, gold prices edged up today ...   

                                         CLEAN_STORY  
0  painful huge reversal fee income unheard among...  
1  formidable opposition alliance among congress ...  
2  asian currency trading lower today south korea...  
3  want answer question click answer clicking ans...  
4  global market gold price edged today disappoin...  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Labels: the numeric SECTION code of each training story.
y = df['SECTION']

# Features: TF-IDF weights over the cleaned stories, vocabulary capped at 5000 terms.
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the
# train/test split done later in this notebook, so the held-out rows also
# shape the vocabulary and IDF weights. Consider fitting on the training
# portion only if the reported test scores must be leakage-free.
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['CLEAN_STORY'])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Baseline classifier: multinomial Naive Bayes on the TF-IDF features.
model = MultinomialNB()
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the Naive Bayes model on the rows it was trained on
# (compare with the held-out accuracy reported in the next cell).
model.score(X_train,y_train)

0.9732874467387742

In [None]:
from sklearn.metrics import accuracy_score, classification_report
# Score the Naive Bayes model on the held-out split.
y_pred_nb = model.predict(X_test)

nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("Accuracy:", nb_accuracy)

# Per-class precision / recall / F1.
nb_report = classification_report(y_test, y_pred_nb)
print(nb_report)

Accuracy: 0.9626474442988204
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       323
           1       0.96      0.97      0.96       549
           2       0.97      0.96      0.96       402
           3       0.95      0.98      0.96       252

    accuracy                           0.96      1526
   macro avg       0.96      0.96      0.96      1526
weighted avg       0.96      0.96      0.96      1526



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees; the fixed seed makes the fit repeatable.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Held-out accuracy followed by the per-class report.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("🔹 Random Forest Accuracy:", rf_accuracy)
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)


🔹 Random Forest Accuracy: 0.953473132372215
              precision    recall  f1-score   support

           0       0.96      0.93      0.95       323
           1       0.95      0.96      0.96       549
           2       0.94      0.97      0.96       402
           3       0.95      0.95      0.95       252

    accuracy                           0.95      1526
   macro avg       0.95      0.95      0.95      1526
weighted avg       0.95      0.95      0.95      1526



In [None]:
from sklearn.svm import LinearSVC

# Linear support-vector classifier with default settings.
svm = LinearSVC().fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# Held-out accuracy followed by the per-class report.
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("🔹 SVM Accuracy:", svm_accuracy)
svm_report = classification_report(y_test, y_pred_svm)
print(svm_report)


🔹 SVM Accuracy: 0.9770642201834863
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       323
           1       0.99      0.98      0.98       549
           2       0.97      0.99      0.98       402
           3       0.96      0.98      0.97       252

    accuracy                           0.98      1526
   macro avg       0.98      0.98      0.98      1526
weighted avg       0.98      0.98      0.98      1526



In [None]:
# Preview the unlabelled test stories.
df_test.head()

Unnamed: 0,STORY
0,2019 will see gadgets like gaming smartphones ...
1,It has also unleashed a wave of changes in the...
2,It can be confusing to pick the right smartpho...
3,The mobile application is integrated with a da...
4,We have rounded up some of the gadgets that sh...


In [None]:
# Run the test stories through the same cleaning function as the training
# stories, so both sets are tokenized and lemmatized identically.
df_test['CLEAN_STORY'] = df_test['STORY'].map(preprocess_text)
df_test.head(5)

Unnamed: 0,STORY,CLEAN_STORY
0,2019 will see gadgets like gaming smartphones ...,see gadget like gaming smartphones wearable me...
1,It has also unleashed a wave of changes in the...,also unleashed wave change mcu make sure futur...
2,It can be confusing to pick the right smartpho...,confusing pick right smartphone segregated top...
3,The mobile application is integrated with a da...,mobile application integrated dashboard confir...
4,We have rounded up some of the gadgets that sh...,rounded gadget showed left indelible mark cons...


In [None]:
# Vectorize the cleaned test stories with the already-fitted TF-IDF vectorizer
# (transform only, no refit), then display the resulting matrix.
X_test_final = tfidf.transform(df_test['CLEAN_STORY'])
X_test_final

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 111129 stored elements and shape (2748, 5000)>

In [None]:
# Predict a SECTION code for every test story.
# NOTE(review): `model` is the MultinomialNB fitted earlier, not the LinearSVC
# that reported the higher held-out accuracy; confirm this choice is intentional.
df_test['PREDICTED_SECTION'] = model.predict(X_test_final)

In [None]:
# Readable names for the predicted SECTION codes.
section_map = {
    0: "Politics",
    1: "Technology",
    2: "Entertainment",
    3: "Business",
}
predicted_codes = df_test['PREDICTED_SECTION']
df_test['PREDICTED_CATEGORY'] = predicted_codes.map(section_map)


In [None]:
# Show each test story with its predicted code and readable category.
print(df_test.loc[:, ['STORY', 'PREDICTED_SECTION', 'PREDICTED_CATEGORY']])

                                                  STORY  PREDICTED_SECTION  \
0     2019 will see gadgets like gaming smartphones ...                  1   
1     It has also unleashed a wave of changes in the...                  2   
2     It can be confusing to pick the right smartpho...                  1   
3     The mobile application is integrated with a da...                  0   
4     We have rounded up some of the gadgets that sh...                  1   
...                                                 ...                ...   
2743  According to researchers, fraud in the mobile ...                  1   
2744  The iPhone XS and XS Max share the Apple A12 c...                  1   
2745  On the photography front, the Note 5 Pro featu...                  1   
2746  UDAY mandated that discoms bring the gap betwe...                  0   
2747  Ripple also helps bank customers send money to...                  1   

     PREDICTED_CATEGORY  
0            Technology  
1         E

In [None]:
import joblib

# Persist the fitted Naive Bayes classifier (`model`) and the fitted TF-IDF
# vectorizer so both can be reloaded together for inference; the vectorizer
# must be the one the model was trained with. The relative file names mean
# both files are written to the current working directory.
joblib.dump(model, 'model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']