In [1]:
!pip install pandas numpy scikit-learn nltk gensim matplotlib seaborn




In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [3]:
# Fetch the NLTK stop-word corpus; the call's return value is shown as the cell output
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
!pip install kagglehub
import kagglehub

# download SMS Spam dataset
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

print("Dataset downloaded to:", path)



Dataset downloaded to: C:\Users\Admn\.cache\kagglehub\datasets\uciml\sms-spam-collection-dataset\versions\1


In [5]:
import os

# show files inside downloaded dataset folder
# Reuse the folder returned by kagglehub.dataset_download() instead of a
# hard-coded, machine-specific absolute path so the cell runs on any machine.
dataset_path = path

os.listdir(dataset_path)


['spam.csv']

In [6]:
# load dataset
# Build the CSV location from the kagglehub download folder (`path`) rather than
# a hard-coded absolute path that only exists on one user's machine.
file_path = os.path.join(path, "spam.csv")

# keep only needed columns (v1 = label, v2 = message text) and give them readable names
df = (
    pd.read_csv(file_path, encoding='latin-1', usecols=['v1', 'v2'])
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

df.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# English stop words held in a set so clean_text() can test membership per token
stop_words = {*stopwords.words('english')}

def clean_text(text):
    """Normalise one message for vectorisation.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is deleted, and tokens found in the module-level
    ``stop_words`` set are dropped.

    Punctuation is stripped *before* the stop-word filter, so a contraction
    such as "don't" is compared as "dont".

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    normalized = re.sub(r'[^\w\s]', '', text.lower())
    kept_tokens = (token for token in normalized.split() if token not in stop_words)
    return " ".join(kept_tokens)

# run clean_text over every message and store the result in a new column
df['clean_text'] = df['text'].map(clean_text)

df.head(5)


Unnamed: 0,label,text,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [8]:
# features (cleaned text) and labels
X, y = df['clean_text'], df['label']

# hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


Training samples: 4457
Testing samples: 1115


In [9]:
# bag-of-words counts: vocabulary is learned on the training split only,
# then the same vocabulary is applied to the test split
count_vec = CountVectorizer()
X_train_count = count_vec.fit_transform(X_train)
X_test_count = count_vec.transform(X_test)

# fit Naive Bayes and predict the held-out messages
nb_model = MultinomialNB().fit(X_train_count, y_train)
pred_nb = nb_model.predict(X_test_count)

# evaluation
nb_accuracy = accuracy_score(y_test, pred_nb)
nb_confusion = confusion_matrix(y_test, pred_nb)
nb_report = classification_report(y_test, pred_nb)
print("Accuracy:", nb_accuracy)
print("\nConfusion Matrix:\n", nb_confusion)
print("\nClassification Report:\n", nb_report)


Accuracy: 0.9802690582959641

Confusion Matrix:
 [[961   4]
 [ 18 132]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.97      0.88      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [10]:
# TF-IDF features: fitted on the training split, reused unchanged for the test split
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Logistic Regression with library defaults, then predictions on the held-out messages
lr_model = LogisticRegression().fit(X_train_tfidf, y_train)
pred_lr = lr_model.predict(X_test_tfidf)

# evaluation
lr_accuracy = accuracy_score(y_test, pred_lr)
lr_confusion = confusion_matrix(y_test, pred_lr)
lr_report = classification_report(y_test, pred_lr)
print("Accuracy:", lr_accuracy)
print("\nConfusion Matrix:\n", lr_confusion)
print("\nClassification Report:\n", lr_report)


Accuracy: 0.9515695067264573

Confusion Matrix:
 [[961   4]
 [ 50 100]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.95      1.00      0.97       965
        spam       0.96      0.67      0.79       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115



In [11]:
# vocabulary terms and the logistic-regression weight learned for each of them
feature_names = tfidf.get_feature_names_out()
coefficients = lr_model.coef_[0]

# one row per word with its coefficient
word_importance = pd.DataFrame({'word': feature_names, 'importance': coefficients})

# largest positive weights -> words pushing towards spam
top_spam = word_importance.sort_values(by='importance', ascending=False).head(10)

# most negative weights -> words pushing towards ham
top_ham = word_importance.sort_values(by='importance').head(10)

for heading, table in (("Top Spam Words:\n", top_spam), ("\nTop Ham Words:\n", top_ham)):
    print(heading)
    print(table)


Top Spam Words:

         word  importance
7488      txt    4.612192
1909    claim    3.926982
3113     free    3.795109
6856     stop    3.778675
4789   mobile    3.701939
1661     call    3.213904
6039    reply    3.122367
5734    prize    2.956106
7160     text    2.917993
6364  service    2.605520

Top Ham Words:

       word  importance
3795     im   -2.025610
4478   ltgt   -2.021118
3792    ill   -1.908342
5207     ok   -1.899597
6519    sir   -1.504458
1986   come   -1.474839
4247  later   -1.434345
2217     da   -1.316825
3345    got   -1.285468
3310  going   -1.282684


In [12]:
import gensim.downloader as api

# The "word2vec-google-news-300" load that used to be here was removed:
# `w2v_model` is reassigned to the lightweight "glove-wiki-gigaword-50" vectors
# in the following cell before it is ever used, so fetching the much larger
# Google News model (roughly 1.6 GB per the gensim-data catalogue) only cost
# download time and memory on every Restart & Run All.


In [13]:
# lightweight pretrained embedding model (fast)
# NOTE: despite the variable name, the vectors loaded here are GloVe, not Word2Vec
w2v_model = api.load(name="glove-wiki-gigaword-50")

print("Model loaded!")


Model loaded!


In [14]:
# nearest neighbours of "king" in the embedding space (ten results, the library default)
w2v_model.most_similar("king", topn=10)


[('prince', 0.8236179351806641),
 ('queen', 0.7839043140411377),
 ('ii', 0.7746230363845825),
 ('emperor', 0.7736247777938843),
 ('son', 0.766719400882721),
 ('uncle', 0.7627150416374207),
 ('kingdom', 0.7542160749435425),
 ('throne', 0.7539914846420288),
 ('brother', 0.7492412328720093),
 ('ruler', 0.7434253096580505)]

In [15]:
# analogy example: king - man + woman
analogy_positive = ["woman", "king"]
analogy_negative = ["man"]
w2v_model.most_similar(positive=analogy_positive, negative=analogy_negative)


[('queen', 0.8523604869842529),
 ('throne', 0.7664334177970886),
 ('prince', 0.759214460849762),
 ('daughter', 0.7473882436752319),
 ('elizabeth', 0.7460220456123352),
 ('princess', 0.7424570322036743),
 ('kingdom', 0.7337412238121033),
 ('monarch', 0.721449077129364),
 ('eldest', 0.7184861898422241),
 ('widow', 0.7099431157112122)]

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# FAQ knowledge base: each entry pairs a predefined question with its answer,
# so a question and its reply cannot drift out of alignment
faq_pairs = [
    ("what is your name", "I am an NLP internship chatbot."),
    ("how are you", "I am doing great!"),
    ("what is machine learning", "Machine learning is a field where computers learn from data."),
    ("what is python", "Python is a programming language used in AI."),
    ("what is nlp", "NLP means Natural Language Processing."),
    ("who created python", "Python was created by Guido van Rossum."),
    ("what is data science", "Data science is extracting insights from data."),
    ("what is artificial intelligence", "Artificial Intelligence means machines that think like humans."),
]

# predefined questions
questions = [question for question, _ in faq_pairs]

# answers (same order as `questions`)
answers = [answer for _, answer in faq_pairs]


# convert questions into vectors
vectorizer = TfidfVectorizer()
Q_vec = vectorizer.fit_transform(questions)

# reply used when the input matches none of the stored questions
FALLBACK_ANSWER = "Sorry, I don't have an answer for that yet."

# chatbot function
def chatbot(user_input, min_similarity=0.0):
    """Return the stored answer whose question is most similar to ``user_input``.

    The input is projected into the TF-IDF space fitted on ``questions`` and
    compared with every stored question by cosine similarity.

    Parameters
    ----------
    user_input : str
        The user's question.
    min_similarity : float, optional
        The best score must be strictly greater than this value for an answer
        to be returned. The default of 0.0 only rejects inputs that share no
        vocabulary with any stored question, so every input that matched
        before still receives the same answer.

    Returns
    -------
    str
        The matching entry of ``answers``, or ``FALLBACK_ANSWER`` when the
        input is empty, not a string, or not similar enough to any question.
    """
    if not isinstance(user_input, str) or not user_input.strip():
        return FALLBACK_ANSWER

    user_vec = vectorizer.transform([user_input])
    scores = cosine_similarity(user_vec, Q_vec).ravel()
    best = int(scores.argmax())

    # Without this guard an all-zero score row makes argmax return 0, which
    # silently answers unrelated input with the first FAQ entry.
    if scores[best] <= min_similarity:
        return FALLBACK_ANSWER
    return answers[best]


In [18]:
# try the chatbot on a few sample queries, one reply per line
for sample_query in ("tell me about python", "what is ai", "how are you"):
    print(chatbot(sample_query))


Python is a programming language used in AI.
Python is a programming language used in AI.
I am doing great!


In [21]:
# Task 3 – NLP Internship Assignment
## Text Classification, Word Embeddings & FAQ Chatbot


In [23]:
# Part 1 – Text Classification Project


In [24]:
# Part 2 – Word Embedding Mini Task


In [25]:
# Part 3 – Mini NLP Application – FAQ Chatbot
