# 1. Install important dependencies

In [1]:
!pip install nltk datasets numpy seaborn pandas scikit-learn matplotlib



# 2. Import Dependencies

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import joblib

In [3]:
# Download the NLTK resources used below: the stop-word list and the
# tokenizer models that word_tokenize relies on.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# fetching both keeps word_tokenize working regardless of the installed version.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load the training data from the local data/ directory into a DataFrame
df = pd.read_csv('data/train.csv')

In [5]:
# Preview the first two rows to check the loaded columns
df.head(2)

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...


# 3. Data Preprocessing

In [6]:
# Build the modelling frame in a single chain:
#   1. rename "Class Index" to "Labels"
#   2. join Title and Description (space-separated) into a new Text column
#   3. drop the now-redundant Title and Description columns
df = (
    df.rename(columns={'Class Index': 'Labels'})
      .assign(Text=lambda frame: frame['Title'] + ' ' + frame['Description'])
      .drop(columns=['Title', 'Description'])
)




In [7]:
# Removing Punctuation and unnecessary symbols
# Backslashes and hyphens become spaces so the words they join stay separate.
_DASH_OR_BACKSLASH = re.compile(r'[\\-]')
# Listed punctuation marks and digits are deleted outright.
_PUNCT_OR_DIGIT = re.compile(r'[,.?;:\'(){}!|0-9]')


def remove_punctuations(text):
    """Strip selected punctuation and all digits from ``text``.

    Backslashes and hyphens are replaced by a single space; the characters
    ``, . ? ; : ' ( ) { } ! |`` and the digits 0-9 are removed. Any other
    character (for example ``#`` or ``"``) is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is not collapsed or trimmed.
    """
    spaced = _DASH_OR_BACKSLASH.sub(' ', text)
    return _PUNCT_OR_DIGIT.sub('', spaced)

# Series.apply calls remove_punctuations on each value of the Text column
# and the cleaned strings replace the original column.
df['Text']=df['Text'].apply(remove_punctuations)

In [8]:
# Lowercase the text and drop English stop words

# Stop-word sets keyed by language. Filled on first use so the NLTK corpus is
# read once instead of once per row when preprocess_text is used with .apply.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it on first use."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Lowercase ``text`` and remove English stop words.

    The text is lowercased, split with NLTK's ``word_tokenize``, and every
    token found in NLTK's English stop-word list is discarded.

    Args:
        text: The string to preprocess.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = _get_stop_words()
    words = word_tokenize(text.lower())
    return ' '.join(word for word in words if word not in stop_words)

In [9]:
# Apply preprocess_text (lowercasing + stop-word removal) to every value of
# the 'Text' column, replacing the column with the result
df['Text'] = df['Text'].apply(preprocess_text)

In [10]:
# Preview the first ten rows to check the cleaned Text column
df.head(10)

Unnamed: 0,Labels,Text
0,3,wall st bears claw back black reuters reuters ...
1,3,carlyle looks toward commercial aerospace reut...
2,3,oil economy cloud stocks outlook reuters reute...
3,3,iraq halts oil exports main southern pipeline ...
4,3,oil prices soar time record posing new menace ...
5,3,stocks end near year lows reuters reuters stoc...
6,3,money funds fell latest week ap ap assets nati...
7,3,fed minutes show dissent inflation usatodaycom...
8,3,safety net forbescom forbescom earning phd soc...
9,3,wall st bears claw back black new york reuters...


# 4. Splitting the training data into train and test data

In [11]:
# Split the data into training and testing sets: 20% of the rows are held out
# for evaluation, and the fixed random_state makes the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [12]:
# Vectorize the preprocessed text data using TF-IDF representation.
# ngram_range=(1, 2) uses unigrams and bigrams; max_features=5000 caps the
# vocabulary size. The vectorizer is fitted on the training text only and the
# fitted vocabulary is then reused to transform the held-out text.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train = tfidf_vectorizer.fit_transform(train_df['Text'])
X_test = tfidf_vectorizer.transform(test_df['Text'])


In [13]:
# Target labels for the two splits, taken from the 'Labels' column so they
# stay row-aligned with X_train and X_test
y_train = train_df['Labels']
y_test = test_df['Labels']

# 5. Model training

In [14]:
# Train a Multinomial Naive Bayes classifier (default hyperparameters) on the
# TF-IDF features of the training split
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [15]:
# Save the fitted TF-IDF vectorizer and the trained classifier to the current
# working directory. Both are needed at prediction time: new text must be
# transformed with this same vectorizer before it is passed to the classifier.

joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(classifier, 'multinomial_nb_model.joblib')

['multinomial_nb_model.joblib']

# 6. Model Evaluation on test data

In [16]:
# Make predictions on the held-out 20% split (X_test), not on data/test.csv
predictions = classifier.predict(X_test)

In [17]:
# Evaluate the model: overall accuracy, then per-class precision, recall,
# F1-score and support from classification_report
print("Accuracy:", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.8927916666666667

Classification Report:
               precision    recall  f1-score   support

           1       0.90      0.89      0.89      5956
           2       0.93      0.97      0.95      6058
           3       0.87      0.84      0.86      5911
           4       0.87      0.86      0.87      6075

    accuracy                           0.89     24000
   macro avg       0.89      0.89      0.89     24000
weighted avg       0.89      0.89      0.89     24000



# 7. Preprocess the test set, load the saved model, and predict

In [18]:
# Load the separate test file from the local data/ directory
df_test = pd.read_csv("data/test.csv")

In [19]:

# Prepare the test data with the same steps used for the training data:
# rename the label column, join Title and Description into Text, and drop the
# two source columns.
df_test = df_test.rename(columns={'Class Index':'Labels'})

df_test['Text'] = df_test['Title'] + ' ' + df_test['Description']

df_test = df_test.drop(['Title', 'Description'], axis=1)

# Reuse remove_punctuations and preprocess_text from section 3 instead of
# redefining them here, so training and test text always go through the exact
# same cleaning code.
df_test['Text'] = df_test['Text'].apply(remove_punctuations)
df_test['Text'] = df_test['Text'].apply(preprocess_text)

In [20]:
# Inspect the preprocessed text of the test row with index label 1
df_test['Text'][1]

'race second private team sets launch date human spaceflight spacecom spacecom toronto canada second team rocketeers competing # million ansari x prize contest privately funded suborbital space flight officially announced first launch date manned rocket'

In [21]:
# Look up the true label of the same row (index label 1) for comparison
df_test['Labels'][1]

4

In [22]:

# Load the TF-IDF vectorizer and the classifier saved earlier in this notebook.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
loaded_tfidf_vectorizer = joblib.load('tfidf_vectorizer.joblib')
loaded_classifier = joblib.load('multinomial_nb_model.joblib')



In [23]:
# Making predictions on new data using the loaded model.
# The sample below is already-preprocessed text (lowercased, punctuation and
# stop words removed); raw text would need remove_punctuations and
# preprocess_text applied first. It must be transformed with the loaded
# vectorizer before being passed to the loaded classifier.
new_data = ['race second private team sets launch date human spaceflight spacecom spacecom toronto canada second team rocketeers competing # million ansari x prize contest privately funded suborbital space flight officially announced first launch date manned rocket']
new_data_transformed = loaded_tfidf_vectorizer.transform(new_data)
new_predictions = loaded_classifier.predict(new_data_transformed)
print("Predictions on new data:", new_predictions)

Predictions on new data: [4]
