## Load the CSV and Inspect Basic Structure

In [None]:
import pandas as pd

#Load dataset
df = pd.read_csv('D:/PROJECTS/Version Control/CHATBOT/building-smart-bots-from-scratch/data/Conversation.csv')

In [24]:
# print shape and first few rows
print(f"Dataset shape: {df.shape} rows x columns \n")

print("First 5 rows:" + "\n", df.head())

Dataset shape: (3725, 3) rows x columns 

First 5 rows:
    Serial no.                             question   
0           0               hi, how are you doing?  \
1           1        i'm fine. how about yourself?   
2           2  i'm pretty good. thanks for asking.   
3           3    no problem. so how have you been?   
4           4     i've been great. what about you?   

                                     answer  
0             i'm fine. how about yourself?  
1       i'm pretty good. thanks for asking.  
2         no problem. so how have you been?  
3          i've been great. what about you?  
4  i've been good. i'm in school right now.  


In [25]:
#List column names explicitly

print("Column names:")
print(df.columns.tolist(), "\n")

Column names:
['Serial no.', 'question', 'answer'] 



## Check for Missing / NaN Values

In [23]:
#Check for any missing values per column

print("Missing value counts:")
print(df.isna().sum(), "\n")

Missing value counts:
Serial no.    0
question      0
answer        0
dtype: int64 



## Inspect Data Types & Unique Values

In [30]:
# Check data types

print("Data types:")
print(df.dtypes, "\n")

# 7. If Serial No exists, is it truly unique?
if 'Serial no.' in df.columns:
    unique_serials = df['Serial no.'].nunique()
    total_rows = df.shape[0]

    print(f"Unique Serial No: {unique_serials} out of {total_rows}")


Data types:
Serial no.     int64
question      object
answer        object
dtype: object 

Unique Serial No: 3725 out of 3725


## Detect Duplicates

In [31]:
# Check for exact duplicate rows
dup_full = df.duplicated(keep=False)  # marks all rows that have a full duplicate
num_dup_full = dup_full.sum()

print(f"Full-row duplicates found: {num_dup_full}")


if num_dup_full > 0:
    print(df[dup_full].head(), "\n")
else:
    print("No exact full-row duplicates.\n")


Full-row duplicates found: 0
No exact full-row duplicates.



In [33]:
# Are there duplicate Questions? (i.e., same text but maybe different answers)
dup_q = df.duplicated(subset=['question'], keep=False)
num_dup_q = dup_q.sum()

print(f"Rows with duplicate 'question' text: {num_dup_q}")


if num_dup_q > 0:
    print(df[dup_q].sort_values('question').head(10), "\n")
else:
    print("No duplicate Questions.\n")


Rows with duplicate 'question' text: 317
      Serial no.                     question   
1619        1619               anything else?  \
1583        1583               anything else?   
596          596             are you serious?   
243          243             are you serious?   
1136        1136                are you sure?   
3275        3275                are you sure?   
2665        2665                are you sure?   
1842        1842                are you sure?   
474          474  did you go to school today?   
422          422  did you go to school today?   

                                             answer  
1619                             i need a notebook.  
1583                       yes. it's not expensive!  
596              i haven't heard anything about it.  
243                       yes, i am really excited.  
1136                             it's almost empty.  
3275          we will be house rich, but cash poor.  
2665                            of course

## Analyze Text Lengths

In [36]:
# Add new columns for text lengths (characters)
df['q_len_chars'] = df['question'].astype(str).map(len)
df['a_len_chars'] = df['answer'].astype(str).map(len)

# 11. Basic statistics on lengths
print("Question length (chars) stats:")
print("Answer length (chars) stats:")
print(df['a_len_chars'].describe(), "\n")

# 12. (Optional) Sort by longest/shortest question
print("5 Longest Questions (chars):")
print(df.sort_values('q_len_chars', ascending=False)[['question', 'q_len_chars']].head(), "\n")

print("5 Shortest Questions (chars):")
print(df.sort_values('q_len_chars', ascending=True)[['question', 'q_len_chars']].head(), "\n")


Question length (chars) stats:
Answer length (chars) stats:
count    3725.000000
mean       32.213154
std        14.545945
min         3.000000
25%        21.000000
50%        30.000000
75%        41.000000
max        97.000000
Name: a_len_chars, dtype: float64 

5 Longest Questions (chars):
                                               question  q_len_chars
2634  what if you fall while you're holding the ligh...           97
3292  when you're inside, you will always hear cars ...           88
3207  i mean, someone used their dirty hands to pick...           88
2329  but when i sneak just one cigarette in the mor...           82
1812  because you'll have an accident. most accident...           82 

5 Shortest Questions (chars):
     question  q_len_chars
3611      so?            3
1294      no.            3
3083      oh.            3
3289      so?            3
947       so?            3 



## Quick Vocabulary Check (Most Common Words)


In [38]:
import re
from collections import Counter

# Tokenize (simple whitespace + punctuation split)
def tokenize(text):
    # Lowercase + split on non-word characters
    tokens = re.findall(r"\b\w+\b", str(text).lower())
    return tokens

# Build one big list of all question‐tokens
all_q_tokens = []
for q in df['question'].astype(str):
    all_q_tokens.extend(tokenize(q))

# Count top 15 most common words
counter_q = Counter(all_q_tokens)
most_common_q = counter_q.most_common(15)

print("Top 15 most common words in Questions:")

for word, freq in most_common_q:
    print(f"{word.ljust(15)} → {freq}")
print()


Top 15 most common words in Questions:
i               → 1270
you             → 972
the             → 764
to              → 683
it              → 652
a               → 634
s               → 501
that            → 454
t               → 391
what            → 348
do              → 302
is              → 262
of              → 240
and             → 225
have            → 218



## Spot-Check Weird or Malformed Rows

In [39]:
# Detect non-ASCII characters
def has_non_ascii(s):
    try:
        s.encode('ascii')
        return False
    except UnicodeEncodeError:
        return True

mask_non_ascii_q = df['question'].astype(str).map(has_non_ascii)
mask_non_ascii_a = df['answer'].astype(str).map(has_non_ascii)

print(f"Questions with non-ASCII chars: {mask_non_ascii_q.sum()}")
print(df[mask_non_ascii_q][['question']].head(), "\n")

print(f"Answers with non-ASCII chars: {mask_non_ascii_a.sum()}")
print(df[mask_non_ascii_a][['answer']].head(), "\n")

Questions with non-ASCII chars: 6
                                               question
858   different thingsÂ—not the same thing all the t...
1523     look at the bottom of my shoesÂ—they're clean.
2097               "Â…and you know you should be glad!"
2100                  oh, yes! "let it be, let it beÂ…"
2328  you don't need a good nose for thatÂ—cigarette... 

Answers with non-ASCII chars: 13
                                                 answer
857   different thingsÂ—not the same thing all the t...
1522     look at the bottom of my shoesÂ—they're clean.
2096               "Â…and you know you should be glad!"
2099                  oh, yes! "let it be, let it beÂ…"
2100            "Â…there will be an answer, let it be!" 



# Data Cleaning & Preprocessing

####   1. Loads original CSV from ../data/
####   2. Renames columns to snake_case
####   3. Strips whitespace
####   4. Normalizes/removes non-ASCII artifacts
####   5. Deduplicates based on 'question' (keeps first)
####   6. Lowercases text
####   7. Saves to ../data/clean_conversation_dataset.csv

In [40]:
import pandas as pd
import unicodedata
import re

# 1) Load original CSV
df = pd.read_csv('D:/PROJECTS/Version Control/CHATBOT/building-smart-bots-from-scratch/data/Conversation.csv')

# 2) Rename columns to snake_case
df = df.rename(columns={'Serial no.': 'serial_no'})

# 3) Strip leading/trailing whitespace from question & answer
df['question'] = df['question'].astype(str).str.strip()
df['answer']   = df['answer'].astype(str).str.strip()

In [41]:
# 4) Normalize Unicode & remove leftover “Â” artifacts
def normalize_text(text):
    # Normalize Unicode (NFKD form) 
    # decomposes characters into ASCII + diacritics
    normalized = unicodedata.normalize('NFKD', text)
    
    # Encode to ASCII and ignore non-ASCII (dropping any leftover accent/Â)
    ascii_bytes = normalized.encode('ascii', 'ignore')
    ascii_str   = ascii_bytes.decode('ascii')
    
    # Remove any weird control chars or multiple spaces
    # (e.g. leftover zero-width spaces, weird punctuation)
    ascii_str = re.sub(r'\s+', ' ', ascii_str)  # collapse multiple whitespace
    ascii_str = ascii_str.strip()
    
    return ascii_str

# Apply normalization to all questions & answers
df['question'] = df['question'].map(normalize_text)
df['answer']   = df['answer'].map(normalize_text)

In [45]:
# 5) Deduplicate based on 'question' (keep the FIRST occurrence)
#    We saw 317 rows where 'question' text was identical. For a Level-1 rule-based bot,
#    we'll keep only the first answer for any duplicated question.

df_before = df.shape[0]
df = df.drop_duplicates(subset=['question'], keep='first').reset_index(drop=True)

df_after = df.shape[0]
print(f"Dropped {df_before - df_after} duplicate questions. New row count: {df_after}")

Dropped 0 duplicate questions. New row count: 3510


In [46]:
# 6) Lowercase all text (optional but recommended for TF-IDF matching)
df['question'] = df['question'].str.lower()
df['answer']   = df['answer'].str.lower()

In [44]:
# 7) Save cleaned CSV for future use

output_path = 'D:/PROJECTS/Version Control/CHATBOT/building-smart-bots-from-scratch/data/clean_conversation_dataset.csv'
df.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to: {output_path}")

Cleaned dataset saved to: D:/PROJECTS/Version Control/CHATBOT/building-smart-bots-from-scratch/data/clean_conversation_dataset.csv


In [None]:
import pandas as pd
df_clean = pd.read_csv('D:/PROJECTS/Version Control/CHATBOT/building-smart-bots-from-scratch/data/clean_conversation_dataset.csv')
print(df_clean.head())
print(df_clean.shape)


   serial_no                             question   
0          0               hi, how are you doing?  \
1          1        i'm fine. how about yourself?   
2          2  i'm pretty good. thanks for asking.   
3          3    no problem. so how have you been?   
4          4     i've been great. what about you?   

                                     answer  
0             i'm fine. how about yourself?  
1       i'm pretty good. thanks for asking.  
2         no problem. so how have you been?  
3          i've been great. what about you?  
4  i've been good. i'm in school right now.  
(3510, 3)


In [49]:
def has_non_ascii(s):
    try:
        s.encode('ascii')
        return False
    except UnicodeEncodeError:
        return True

print("Any non-ASCII in questions?", df_clean['question'].map(has_non_ascii).sum())
print("Any non-ASCII in answers?",   df_clean['answer'].map(has_non_ascii).sum())


Any non-ASCII in questions? 0
Any non-ASCII in answers? 0
