In [1]:
import pandas as pd

# Read the raw twcs.csv file; the path is resolved relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="../data/raw/twcs.csv")

# Show (n_rows, n_columns) as the cell output to confirm the load.
df.shape

(2811774, 7)

In [2]:
import sys
import os

# Add project root to Python path so project packages can be imported from
# this notebook. The membership check keeps the cell idempotent: re-running it
# no longer appends the same directory again.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [4]:
import pickle

# This notebook writes ../models/faq_data.pkl in two formats: plain pickle and
# joblib.dump(..., compress=3). pickle.load cannot read the compressed variant,
# so joblib.load is used here; it reads both.
# SECURITY: unpickling executes code stored in the file - only load artifacts
# produced by this project.
import joblib

faq_df = joblib.load('../models/faq_data.pkl')

faq_df.head()

Unnamed: 0,question,answer,clean_question,clean_answer
0,@sprintcare I have sent several private messag...,@115712 I understand. I would like to assist y...,i have sent several private messages and no on...,i understand i would like to assist you we wou...
1,@sprintcare I did.,@115712 Please send us a Private Message so th...,i did,please send us a private message so that we ca...
2,@sprintcare is the worst customer service,@115712 Can you please send us a private messa...,is the worst customer service,can you please send us a private message so th...
3,@sprintcare is the worst customer service,@115712 I would love the chance to review the ...,is the worst customer service,i would love the chance to review the account ...
4,@sprintcare is the worst customer service,@115712 Hello! We never like our customers to ...,is the worst customer service,hello we never like our customers to feel like...


In [5]:
import re

def clean_text(text):
    """Normalize a piece of text for matching.

    The text is lowercased, then URLs (anything starting with ``http``),
    @mentions and every character that is not an ASCII letter, digit or
    whitespace are removed. Runs of whitespace are collapsed to one space and
    the result is stripped.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing value in an object column) are treated as empty text;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string, or ``""`` when the input is missing or nothing
        survives the cleaning.
    """
    # Missing values have no .lower(); map them to empty text instead of
    # raising AttributeError in the middle of a Series.apply().
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)        # remove links
    text = re.sub(r"@\w+", "", text)           # remove @mentions
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text) # remove punctuation / non-ASCII
    text = re.sub(r"\s+", " ", text).strip()   # remove extra spaces
    return text

In [6]:
# Add a normalized companion column for each raw text column by running
# clean_text over every value.
for raw_col, clean_col in (('question', 'clean_question'), ('answer', 'clean_answer')):
    faq_df[clean_col] = faq_df[raw_col].apply(clean_text)

In [26]:
# Display the column labels of faq_df as the cell output.
faq_df.columns

Index(['text_question', 'text_answer'], dtype='object')

In [27]:
# 1) Keep only the two required columns and rename them to match what the
#    Streamlit app expects.
# 2) Actually normalize the questions: a rename alone would leave raw text
#    (mentions, links, punctuation) in a column called 'clean_question'.
#    Elsewhere in this notebook that column is produced with clean_text, so
#    the same function is applied here. Missing values become "" first
#    because clean_text calls .lower() on its input.
faq_small = (
    faq_df[['text_question', 'text_answer']]
    .rename(columns={
        'text_question': 'clean_question',
        'text_answer': 'answer'
    })
    .assign(
        clean_question=lambda frame: frame['clean_question'].fillna("").apply(clean_text)
    )
)

# 3) Remove duplicates (evaluated on the cleaned question + answer pair)
faq_small = faq_small.drop_duplicates().reset_index(drop=True)

# Inspect shape
faq_small.shape

(200, 2)

In [29]:
# --- Save compressed FAQ and vectorizer (corrected paths) ---
import joblib, os
from sklearn.feature_extraction.text import TfidfVectorizer

# 1) Write faq_small with compression level 3 (IMPORTANT: the path starts with ../)
joblib.dump(faq_small, "../models/faq_data.pkl", compress=3)

# 2) Build the TF-IDF vectorizer and learn its vocabulary from the
#    clean_question column; fit() hands back the fitted estimator itself
vectorizer = TfidfVectorizer(stop_words="english", max_features=50000).fit(
    faq_small['clean_question']
)

# 3) Write the fitted vectorizer with the same compression level
joblib.dump(vectorizer, "../models/vectorizer.pkl", compress=3)

# 4) Measure both files on disk, in whole kilobytes
faq_kb, vec_kb = (
    os.path.getsize(artifact_path) // 1024
    for artifact_path in ("../models/faq_data.pkl", "../models/vectorizer.pkl")
)

print("Saved: ../models/faq_data.pkl ->", faq_kb, "KB")
print("Saved: ../models/vectorizer.pkl ->", vec_kb, "KB")

Saved: ../models/faq_data.pkl -> 24 KB
Saved: ../models/vectorizer.pkl -> 13 KB


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF configuration: ignore English stop words and cap the vocabulary at
# 50,000 terms.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=50000,
)

# Learn the vocabulary from the clean_question column and encode every row in
# the same call.
tfidf_matrix = vectorizer.fit_transform(faq_df['clean_question'])

# (n_questions, n_terms)
tfidf_matrix.shape

(1261730, 50000)

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_response(user_query, threshold=0.2):
    """Return the stored answer whose question best matches ``user_query``.

    The query is normalized with ``clean_text``, vectorized with the
    notebook-level ``vectorizer`` and compared by cosine similarity against
    every row of ``tfidf_matrix``. The answer is then read from the row of
    ``faq_df`` at the same position, so ``tfidf_matrix`` and ``faq_df`` must
    be row-aligned.

    Args:
        user_query: Free-text question from the user.
        threshold: Minimum cosine similarity the best match must reach to be
            returned. Defaults to 0.2, the value that used to be hard-coded.

    Returns:
        The ``answer`` value of the best-matching ``faq_df`` row, or a fixed
        fallback sentence when the query is empty after cleaning or no stored
        question is similar enough.
    """
    fallback = "I'm not sure about that. Could you rephrase your question?"

    user_query_clean = clean_text(user_query)
    if not user_query_clean:
        # Nothing is left to match on; skip the similarity search over the
        # whole matrix, which could only produce all-zero scores.
        return fallback

    user_vec = vectorizer.transform([user_query_clean])

    # One similarity score per stored question.
    similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()

    best_index = np.argmax(similarities)
    best_score = similarities[best_index]

    if best_score < threshold:
        return fallback

    return faq_df.iloc[best_index]['answer']

In [9]:
# Try the matcher on a connectivity complaint; the returned answer is the cell output.
get_response("My internet is not working")

'@230409 My apologies for the issues you are having with your internet.  Please follow/DM your service phone number and I would be happy to look into this for you. ^JK'

In [10]:
# Try the matcher on a password-reset question; the returned answer is the cell output.
get_response("How can I reset my password?")

"@152535 Hey Marcus, help's arrived! Could you DM us your username and email address? We'll check it out üîç /JY https://t.co/ldFdZRiNAt"

In [11]:
# Try the matcher on a request to reach customer care; the returned answer is the cell output.
get_response("I want to talk to customer care")

'@279936 You can reach out to our customer care by clicking on the link provided earlier and we will be happy to help you with this. ^KS'

In [12]:
import pickle

# Write the vectorizer first, then the FAQ frame, as plain pickles. Each file
# is closed as soon as its own dump completes.
for artifact, target in (
    (vectorizer, '../models/vectorizer.pkl'),
    (faq_df, '../models/faq_data.pkl'),
):
    with open(target, 'wb') as f:
        pickle.dump(artifact, f)

In [13]:
import os
# Show the kernel's current working directory; the relative ../ paths used in
# this notebook are resolved against it.
os.getcwd()

'c:\\Users\\HP\\Documents\\FUTURE_ML_03\\notebooks'

In [14]:
from utils.chatbot_engine import ChatbotEngine

In [15]:
# Build the engine from the two artifacts saved under ../models.
chatbot = ChatbotEngine(vectorizer_path='../models/vectorizer.pkl', faq_data_path='../models/faq_data.pkl')

# Ask one sample question; the reply is shown as the cell output.
chatbot.get_response("My internet is slow")

'@189420 Can you send me a DM so I can look into your connection? -AG'

In [16]:
import pickle

# Save the current faq_df as a plain pickle. The `with` block guarantees the
# file is flushed and closed; passing a bare open(...) to pickle.dump leaves
# the handle open until garbage collection.
with open("../models/faq_data.pkl", "wb") as f:
    pickle.dump(faq_df, f)

In [17]:
# Save the fitted vectorizer as a plain pickle. The `with` block guarantees
# the file is flushed and closed instead of leaking the handle.
with open("../models/vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

In [18]:
# Save faq_df as a plain pickle. The `with` block guarantees the file is
# flushed and closed instead of leaking the handle.
with open("../models/faq_data.pkl", "wb") as f:
    pickle.dump(faq_df, f)

In [19]:
# Display (n_rows, n_columns) of faq_df as the cell output.
faq_df.shape

(1261730, 4)

In [20]:
# Print the shape of each candidate variable that currently exists in the
# notebook namespace. Names are looked up with globals().get() rather than
# eval(), and the explicit checks replace a bare `except: pass`, which would
# also have hidden unrelated errors (including KeyboardInterrupt).
for var in ['df', 'faq_df', 'clean_df', 'filtered_df', 'qa_pairs_df']:
    candidate = globals().get(var)
    # Skip names that are undefined (or None) and objects without a .shape.
    if candidate is not None and hasattr(candidate, 'shape'):
        print(var, candidate.shape)

df (2811774, 7)
faq_df (1261730, 4)


In [21]:
# Split the tweets on whether they reply to another tweet: rows without an
# in_response_to_tweet_id are used as questions, rows with one as answers.
# NOTE(review): only the reply field is checked, not who wrote the tweet -
# confirm this is the intended customer/agent split.
is_reply = df["in_response_to_tweet_id"].notnull()
customer_questions = df[~is_reply]
agent_answers = df[is_reply]

# Pair each reply with the tweet it responds to. The inner join keeps only
# replies whose target is one of the question tweets; the overlapping "text"
# column is disambiguated as text_answer (left) / text_question (right).
qa_pairs_df = pd.merge(
    agent_answers,
    customer_questions[["tweet_id", "text"]],
    how="inner",
    left_on="in_response_to_tweet_id",
    right_on="tweet_id",
    suffixes=("_answer", "_question"),
)[["text_question", "text_answer"]]

qa_pairs_df.head()

Unnamed: 0,text_question,text_answer
0,@sprintcare is the worst customer service,@115712 Can you please send us a private messa...
1,@115714 y’all lie about your “great” connectio...,@115713 H there! We'd definitely like to work ...
2,"@115714 whenever I contact customer support, t...",@115715 Please send me a private message so th...
3,actually that's a broken link you sent me and ...,@115716 The information pertaining to the acco...
4,"Yo @Ask_Spectrum, your customer service reps a...","@115717 Hello, My apologies for any frustratio..."


In [22]:
# Draw a fixed-size random subset of the question/answer pairs; the seed makes
# the selection repeatable. Note that this rebinds faq_df.
faq_df = qa_pairs_df.sample(n=200, random_state=42)

# (n_rows, n_columns) of the subset
faq_df.shape

(200, 2)

In [23]:
import pickle

# Save the sampled faq_df as a plain pickle. The `with` block guarantees the
# file is flushed and closed instead of leaking the handle.
with open("../models/faq_data.pkl", "wb") as f:
    pickle.dump(faq_df, f)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer (English stop words removed) on the question text.
# NOTE(review): this fits on the raw text_question column, not on text passed
# through clean_text - confirm that matches how queries are vectorized later.
vectorizer = TfidfVectorizer(stop_words="english")
vectorizer.fit(faq_df["text_question"])

# Save the fitted vectorizer as a plain pickle. The `with` block guarantees
# the file is flushed and closed instead of leaking the handle.
with open("../models/vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)