In [8]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import math
import joblib
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\srmpc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df = pd.read_csv("./data/language_model.csv", encoding="ISO-8859-1", header=None)

#Sentiment140 dataset with 1.6 million tweets from kaggle

In [3]:
df.columns = ['label', 'id', 'date', 'query', 'user', 'text']

df.head()

Unnamed: 0,label,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
#Removing We will remove:
# URLs
# Mentions (@username)
# Hashtags symbol (# but keep word)
# Emojis (optional â€” keep if you want)
# Special characters
# Extra spaces


def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)   # remove URLs
    text = re.sub(r"@\w+", "", text)            # remove mentions
    text = re.sub(r"[^a-zA-Z\s]", "", text)     # remove punctuation/emojis
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["clean"] = df["text"].apply(clean_text)
df.head()

Unnamed: 0,label,id,date,query,user,text,clean
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",a thats a bummer you shoulda got david carr of...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball managed to sav...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",no its not behaving at all im mad why am i her...


In [5]:
df.columns

Index(['label', 'id', 'date', 'query', 'user', 'text', 'clean'], dtype='object')

In [9]:
df = df.drop(columns=['label', 'id', 'date', 'query', 'user', 'text'])

In [10]:
tokens = []

for text in df['clean']:
        tokens.extend(word_tokenize(text))

len(tokens)


19855818

In [11]:
bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens)-1)]

word_count = Counter(tokens)
bigram_count = Counter(bigrams)

vocab = len(word_count)

In [12]:
def bigram_prob(w1, w2):
    return (bigram_count[(w1, w2)] + 1) / (word_count[w1] + vocab)

In [19]:
bigram_probabilities = {}

for (w1, w2), count in bigram_count.items():
    prob = (count + 1) / (word_count[w1] + vocab)
    bigram_probabilities[(w1, w2)] = prob

In [13]:
def predict_next_word(word, top_k=5):
    candidates = []
    
    for w in word_count:
        prob = bigram_prob(word, w)
        candidates.append((w, prob))
    
    return sorted(candidates, key=lambda x: x[1], reverse=True)[:top_k]


In [14]:
print(predict_next_word("i"))


[('have', 0.0376550109333965), ('am', 0.02957874995831028), ('dont', 0.027423712086940385), ('love', 0.024797473474647946), ('was', 0.024215100145122986)]


In [15]:
def generate_tweet(seed, max_len=12):
    words = [seed.lower()]
    
    for _ in range(max_len):
        next_word = predict_next_word(words[-1], top_k = 1)[0][0]
        words.append(next_word)
        
        if next_word in ['!', ".", "?"]:
            break
    return " ".join(words)

In [16]:
print(generate_tweet("i"))


i have to be a good morning i have to be a good


In [17]:
def perplexity(test_tokens):
    total_log_prob = 0
    N = len(test_tokens)
    
    for i in range(1, N):
        prob = bigram_prob(test_tokens[i-1], test_tokens[i])
        total_log_prob += math.log(prob)
    
    return math.exp(-total_log_prob / N)

test_text = word_tokenize("i love coding and learning new things")
print("Perplexity:", perplexity(test_text))


Perplexity: 2686.353591359289


In [20]:
joblib.dump(bigram_probabilities, "bigram_probs.pkl")
joblib.dump(bigram_count, "bigram_counts.pkl")


['bigram_counts.pkl']