In [4]:
# Load the manually downloaded WikiText-2 file
with open("test.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()


In [5]:
# Remove empty lines and join into a single string
text = "\n".join([line.strip() for line in lines if line.strip() != ""])


In [6]:
print("✅ File loaded successfully.")
print("Total characters:", len(text))
print("Preview:\n")
print(text[:500])


✅ File loaded successfully.
Total characters: 1115427
Preview:

"
= Tropical Storm <unk> ( 2008 ) =
Tropical Storm <unk> was the tenth tropical storm of the 2008 Atlantic hurricane season . <unk> developed out of a strong tropical wave which moved off the African coast on August 31 . The wave quickly became organized and was declared Tropical Depression Ten while located 170 mi ( 270 km ) to the south @-@ southeast of the Cape Verde Islands on September 2 . The depression was quickly upgraded to Tropical Storm <unk> around noon the same day . Over the next s


In [7]:
import re

# Optional cleanup: remove <unk> tokens and section headers like "= ="
cleaned_text = re.sub(r"<unk>", "", text)                  # remove <unk>
cleaned_text = re.sub(r"=+.*=+", "", cleaned_text)         # remove section headers
cleaned_text = re.sub(r"[^\w\s.,!?']", " ", cleaned_text)  # remove special symbols except basic punctuation
cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()   # normalize whitespace


In [8]:
print("Preview:\n")
print(text[:500])

Preview:

"
= Tropical Storm <unk> ( 2008 ) =
Tropical Storm <unk> was the tenth tropical storm of the 2008 Atlantic hurricane season . <unk> developed out of a strong tropical wave which moved off the African coast on August 31 . The wave quickly became organized and was declared Tropical Depression Ten while located 170 mi ( 270 km ) to the south @-@ southeast of the Cape Verde Islands on September 2 . The depression was quickly upgraded to Tropical Storm <unk> around noon the same day . Over the next s


In [9]:
# Split the cleaned text into tokens (words)
tokens = cleaned_text.split()
print("Total tokens:", len(tokens))
print("First 20 tokens:", tokens[:20])


Total tokens: 189387
First 20 tokens: ['Tropical', 'Storm', 'was', 'the', 'tenth', 'tropical', 'storm', 'of', 'the', '2008', 'Atlantic', 'hurricane', 'season', '.', 'developed', 'out', 'of', 'a', 'strong', 'tropical']


In [10]:
from sklearn.model_selection import train_test_split

# Split into training and test sets
train_tokens, test_tokens = train_test_split(
    tokens, test_size=0.2, random_state=42
)

print("Train tokens:", len(train_tokens))
print("Test tokens:", len(test_tokens))

Train tokens: 151509
Test tokens: 37878


In [11]:
train_text = " ".join(train_tokens)
test_text = " ".join(test_tokens)

In [12]:
def generate_ngrams(tokens, n=3):
    X, y = [], []
    for i in range(len(tokens) - n):
        context = " ".join(tokens[i:i+n-1])  # Context (X)
        next_word = tokens[i+n-1]            # Target word (y)
        X.append(context)
        y.append(next_word)
    return X, y

X_seq, y_seq = generate_ngrams(tokens, n=3)


In [13]:
# Reduce dataset size
limit = 20000
X_seq = X_seq[:limit]
y_seq = y_seq[:limit]

# Use smaller TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=3000)
X_vec = vectorizer.fit_transform(X_seq)

# Smaller model
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(64,), max_iter=200)
mlp.fit(X_vec, y_seq)




0,1,2
,hidden_layer_sizes,"(64,)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,200
,shuffle,True


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(max_features=3000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [17]:
mlp = MLPClassifier(hidden_layer_sizes=(64,), max_iter=200)
mlp.fit(X_train_vec, y_train)



0,1,2
,hidden_layer_sizes,"(64,)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,200
,shuffle,True


In [18]:
sample_inputs = X_test[:10]
sample_vectors = vectorizer.transform(sample_inputs)
sample_preds = mlp.predict(sample_vectors)

for i in range(10):
    print(f"🧠 Input: '{sample_inputs[i]}' ➜ Predicted: '{sample_preds[i]}' | Actual: '{y_test[i]}'")

🧠 Input: 'commanded by' ➜ Predicted: 'Prince' | Actual: '.'
🧠 Input: 'was given' ➜ Predicted: 'the' | Actual: 'redshirt'
🧠 Input: 'driven back' ➜ Predicted: 'back' | Actual: 'at'
🧠 Input: 'overall effects' ➜ Predicted: 'is' | Actual: 'were'
🧠 Input: 'the Washington' ➜ Predicted: 'Street' | Actual: 'Crossing'
🧠 Input: 'north to' ➜ Predicted: 'U.S.' | Actual: '.'
🧠 Input: 'to the' ➜ Predicted: 'north' | Actual: 'fort'
🧠 Input: 'while also' ➜ Predicted: 'his' | Actual: 'being'
🧠 Input: 'This proposal' ➜ Predicted: 'never' | Actual: 'was'
🧠 Input: 'greater .' ➜ Predicted: 'and' | Actual: 'However'


In [None]:
def chatbot_response(user_input):
    if len(user_input.strip().split()) < 2:
        return "Please provide at least 2 words."
    
    input_vec = vectorizer.transform([user_input.lower()])
    prediction = mlp.predict(input_vec)[0]
    return f"{user_input} {prediction}"

# Start chatting
print("🤖 Chatbot ready! Type 'exit' to quit.")

while True:
    msg = input("You: ").strip()
    if msg.lower() in ["exit", "quit"]:
        print("Bot: Bye, brother! 👋")
        break
    print("Bot:", chatbot_response(msg))


🤖 Chatbot ready! Type 'exit' to quit.
Bot: Please provide at least 2 words.
Bot: Please provide at least 2 words.
Bot: Hello how are you A
Bot: Can you help me see
Bot: Working on a on
Bot: All night the
Bot: All night the
Bot: Please provide at least 2 words.
Bot: Please provide at least 2 words.
Bot: Please provide at least 2 words.
Bot: Please provide at least 2 words.
