**Tokenization and stemming**

In [0]:

import pandas as pd
df=pd.read_csv("tweet_data.csv")
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

In [0]:

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:

#Preprocessing
import json
import re
from textblob import TextBlob
import string
from nltk.corpus import stopwords

nltk.download('stopwords')
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)

def clean_tweets(tweet):
  ps = PorterStemmer()
  stop_words = set(stopwords.words('english'))
  word_tokens = word_tokenize(tweet)
  tweet = re.sub(r':', '', tweet)
  tweet = re.sub(r'‚Ä¶', '', tweet)
  tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet)
  tweet = emoji_pattern.sub(r'', tweet)
  filtered_tweet = [ps.stem(w) for w in word_tokens if not w in stop_words and w not in string.punctuation]
  return ' '.join(filtered_tweet)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
cleaned_tweets = []
for i in range(1,df.shape[0]):
  cleaned_tweets.append(clean_tweets(df.at[i,"tweet"]))
cleaned_tweets

["b'rt bengalurubul A littl progress day add big results.\\n\\n fullchargemaadi champion vivopkl7 kabaddi vivoprokabaddileagu\\xe2\\x80\\xa6",
 "b'rt jaipurpanth matchday No 8 the panthersquad kick action chennai leg upyoddha\\n.\\n.\\n roarforpanth panthe\\xe2\\x80\\xa6",
 "b '' RT bengalurubul our bullsquad 's defens get better everi pass day defens prove hand o\\xe2\\x80\\xa6 ''",
 "b'rt jaipurpanth onli one day go take upyoddha first encount chennai leg.\\n.\\n.\\n roarforpanth panther\\xe2\\x80\\xa6",
 "b'rt khelkabaddinew congratul thakurkabaddi \\xf0\\x9f\\x98\\x8d\\n arjunaward kabaddi ajaythakur khelkabaddi http //t.co/lnuq1milta",
 "b'thi thalaiva second home game yet tast victori lost previou match tie one\\xe2\\x80\\xa6 http //t.co/grp4n0rvtl",
 "b'rt bengalurubul nammafan nippon paint all-round moment match \\n\\n fullchargemaadi champion vivopkl7 kabaddi \\xe2\\x80\\xa6",
 "b'rt bengalurubul nammafan chosen walkmat fashion moment match \\n\\n fullchargemaadi champion vivo

In [0]:
all_tweets = ' '.join(cleaned_tweets)
all_words = all_tweets.split()
all_words

["b'rt",
 'bengalurubul',
 'A',
 'littl',
 'progress',
 'day',
 'add',
 'big',
 'results.\\n\\n',
 'fullchargemaadi',
 'champion',
 'vivopkl7',
 'kabaddi',
 'vivoprokabaddileagu\\xe2\\x80\\xa6',
 "b'rt",
 'jaipurpanth',
 'matchday',
 'No',
 '8',
 'the',
 'panthersquad',
 'kick',
 'action',
 'chennai',
 'leg',
 'upyoddha\\n.\\n.\\n',
 'roarforpanth',
 'panthe\\xe2\\x80\\xa6',
 'b',
 "''",
 'RT',
 'bengalurubul',
 'our',
 'bullsquad',
 "'s",
 'defens',
 'get',
 'better',
 'everi',
 'pass',
 'day',
 'defens',
 'prove',
 'hand',
 'o\\xe2\\x80\\xa6',
 "''",
 "b'rt",
 'jaipurpanth',
 'onli',
 'one',
 'day',
 'go',
 'take',
 'upyoddha',
 'first',
 'encount',
 'chennai',
 'leg.\\n.\\n.\\n',
 'roarforpanth',
 'panther\\xe2\\x80\\xa6',
 "b'rt",
 'khelkabaddinew',
 'congratul',
 'thakurkabaddi',
 '\\xf0\\x9f\\x98\\x8d\\n',
 'arjunaward',
 'kabaddi',
 'ajaythakur',
 'khelkabaddi',
 'http',
 '//t.co/lnuq1milta',
 "b'thi",
 'thalaiva',
 'second',
 'home',
 'game',
 'yet',
 'tast',
 'victori',
 'lost

In [0]:
from nltk import ngrams


threegrams = ngrams(all_words, 3)

for grams in threegrams:
  print(grams)

("b'rt", 'bengalurubul', 'A')
('bengalurubul', 'A', 'littl')
('A', 'littl', 'progress')
('littl', 'progress', 'day')
('progress', 'day', 'add')
('day', 'add', 'big')
('add', 'big', 'results.\\n\\n')
('big', 'results.\\n\\n', 'fullchargemaadi')
('results.\\n\\n', 'fullchargemaadi', 'champion')
('fullchargemaadi', 'champion', 'vivopkl7')
('champion', 'vivopkl7', 'kabaddi')
('vivopkl7', 'kabaddi', 'vivoprokabaddileagu\\xe2\\x80\\xa6')
('kabaddi', 'vivoprokabaddileagu\\xe2\\x80\\xa6', "b'rt")
('vivoprokabaddileagu\\xe2\\x80\\xa6', "b'rt", 'jaipurpanth')
("b'rt", 'jaipurpanth', 'matchday')
('jaipurpanth', 'matchday', 'No')
('matchday', 'No', '8')
('No', '8', 'the')
('8', 'the', 'panthersquad')
('the', 'panthersquad', 'kick')
('panthersquad', 'kick', 'action')
('kick', 'action', 'chennai')
('action', 'chennai', 'leg')
('chennai', 'leg', 'upyoddha\\n.\\n.\\n')
('leg', 'upyoddha\\n.\\n.\\n', 'roarforpanth')
('upyoddha\\n.\\n.\\n', 'roarforpanth', 'panthe\\xe2\\x80\\xa6')
('roarforpanth', 'pant

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
X = vec.fit_transform(cleaned_tweets)
df_term = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
print(df_term)

    10  2019  29  3o1uysegy2  ...  yesterday  yet  yoddha  zmg4badld4
0    0     0   0           0  ...          0    0       0           0
1    0     0   0           0  ...          0    0       0           0
2    0     0   0           0  ...          0    0       0           0
3    0     0   0           0  ...          0    0       0           0
4    0     0   0           0  ...          0    0       0           0
5    0     0   0           0  ...          0    1       0           0
6    0     0   0           0  ...          0    0       0           0
7    0     0   0           0  ...          0    0       0           0
8    0     0   0           0  ...          0    0       0           0
9    0     0   0           0  ...          0    0       0           0
10   0     0   0           0  ...          0    0       0           0
11   0     0   0           0  ...          0    0       0           0
12   0     0   0           0  ...          0    0       0           0
13   0     0   0    

**Similarity measures**

In [0]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return len(s1.intersection(s2)) / len(s1.union(s2))
  
jaccard_similarity(df.at[1,"tweet"].split(),df.at[0,"tweet"].split())

0.0

In [0]:
import re, math
from collections import Counter
WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
  intersection = set(vec1.keys()) & set(vec2.keys())
  numerator = sum([vec1[x] * vec2[x] for x in intersection])
  sum1 = sum([vec1[x]**2 for x in vec1.keys()])
  sum2 = sum([vec2[x]**2 for x in vec2.keys()])
  denominator = math.sqrt(sum1) * math.sqrt(sum2)
  if not denominator:
     return 0.0
  else:
     return float(numerator) / denominator
def get_dice(vec1, vec2):
  intersection = set(vec1.keys()) & set(vec2.keys())
  numerator = 2 *sum([vec1[x] * vec2[x] for x in intersection])
  sum1 = sum([vec1[x]**2 for x in vec1.keys()])
  sum2 = sum([vec2[x]**2 for x in vec2.keys()])
  denominator = sum1 + sum2
  if not denominator:
     return 0.0
  else:
     return float(numerator) / denominator
def text_to_vector(text):
  words = WORD.findall(text)
  return Counter(words)

text1 = df.at[1,"tweet"]
text2 = df.at[0,"tweet"]

vector1 = text_to_vector(text1)
vector2 = text_to_vector(text2)

print("cosine",get_cosine(vector1, vector2))
print("dice",get_dice(vector1,vector2))

cosine 0.22283440581246225
dice 0.2222222222222222
