<a href="https://colab.research.google.com/github/Pushkar-Bhuse/PrisonSystem/blob/master/PoliticalClustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
pip install mpld3




In [0]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mplid3

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, words, wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
downloaded = drive.CreateFile({'id':"1QeC_cCMizrD7aG8VDdJpeS80dWLTja5N"})   # replace the id with id of file you want to access
downloaded.GetContentFile('Political.csv') 

In [0]:
downloaded = drive.CreateFile({'id':"1W2tkrxHTSdZFs32KUznyeuMJKQ4Q1PXS"})   # replace the id with id of file you want to access
downloaded.GetContentFile('nonPolitical.csv')  

In [0]:
political_data = pd.read_csv("Political.csv")
nonpolitical_data = pd.read_csv("nonPolitical.csv")

political_data.head()

Unnamed: 0.1,Unnamed: 0,Tweets,label
0,0,"Since the beginning of the outbreak, retailers...",POLITICAL
1,1,"Out of 5 , Three belongs to #Prabh…Covering# 5...",POLITICAL
2,2,India - Muslims 15%\nMcD going Muslims - 5%\n....,POLITICAL
3,3,The Heritage Garden Project : building self su...,POLITICAL
4,4,It's…RT @Arun2981: People who will travel by t...,POLITICAL


In [0]:
nonpolitical_data = nonpolitical_data.rename(columns={"RT @humanvibration: GAN technology is both unimaginably complex and singularly simple. \n\nTrained a dataset of real inputs, like faces, the…":"Tweets",
                                  "NONE": "label"})

In [0]:
train_data = pd.concat([political_data, nonpolitical_data], ignore_index = True)
train_data = train_data.reindex(np.random.permutation(train_data.index))
train_data = train_data.reset_index()
train_data = train_data.drop(columns=['index', 'Unnamed: 0'])

In [0]:
train_data['label'] = train_data['label'].map(lambda x : 1 if x=='POLITICAL' else 0)

In [0]:
stopwords = nltk.corpus.stopwords.words('english')

In [0]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [0]:
HAPPY_EMO = r" ([xX;:]-?[dD)]|:-?[\)]|[;:][pP]) "
SAD_EMO = r" (:'?[/|\(]) "

In [0]:
def lemmatizing_preprocess(text):

  lemmatizer = WordNetLemmatizer()
  tokenized = word_tokenize(text)

  fin = [lemmatizer.lemmatize(t) for t in tokenized]
  return " ".join(fin)

In [0]:
replacement_patterns = [
(r'won\'t', 'will not'),
(r'can\'t', 'can not'),
(r'i\'m', 'i am'),
(r'ain\'t', 'is not'),
(r'(\w+)\'ll', '\g<1> will'),
(r'(\w+)n\'t', '\g<1> not'),
(r'(\w+)\'ve', '\g<1> have'),
(r'(\w+)\'s', '\g<1> is'),
(r'(\w+)\'re', '\g<1> are'),
(r'(\w+)\'d', '\g<1> would')
]

class RegexpReplacer(object):

   def __init__(self, patterns=replacement_patterns):
     self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

   def replace(self, text):
     s = text
     for (pattern, repl) in self.patterns:
       s = re.sub(pattern, repl, s)

     return s


class RepeatReplacer(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word



In [0]:
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
import string

def remove_unknown(text):
  printable = set(string.printable)
  return ''.join(filter(lambda x: x in printable, text))

seperate_words = RegexpReplacer()

repeated_letters = RepeatReplacer()

train_data["Tweets"] = train_data["Tweets"].str.replace(r"[-\.\n]", "")
# Removing HTML garbage
train_data["Tweets"] = train_data["Tweets"].str.replace(r"&\w+;", "")
# Removing links
train_data["Tweets"] = train_data["Tweets"].str.replace(r"https?://\S*", "")

# mark emoticons as happy or sad
train_data["Tweets"] = train_data["Tweets"].str.replace(HAPPY_EMO, " happy ")
train_data["Tweets"] = train_data["Tweets"].str.replace(SAD_EMO, " sad ")
train_data["Tweets"] = train_data["Tweets"].str.lower()

train_data["Tweets"] = train_data["Tweets"].apply(remove_unknown)
train_data["Tweets"] = train_data["Tweets"].apply(lambda x : seperate_words.replace(x))
train_data["Tweets"] = train_data["Tweets"].apply(lambda x : repeated_letters.replace(x))

train_data["Tweets"] = train_data["Tweets"].str.translate(str.maketrans('', '', string.punctuation))

#Handling tags, In this case we remove all the tags
train_data["Tweets"] = train_data["Tweets"].str.replace(r"@[a-zA-Z0-9_]* ", "")

In [0]:
train_data["Tweets"] = train_data["Tweets"].apply(lemmatizing_preprocess)

In [0]:
train_data.head()

Unnamed: 0,Tweets,label
0,rt mathurvaishali my coleague shahbaz who desi...,0
1,whereisosha this story is so pencedemic so sti...,0
2,space nasa ufo spacex spacexploration mondayth...,1
3,barbados matwhirpol now we realy have to go ba...,0
4,like for surjewala,1


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_tokenizer(text):
  return word_tokenize(text)

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, use_idf=True, tokenizer=tfidf_tokenizer, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(train_data["Tweets"].tolist())

print(tfidf_matrix.shape)

CPU times: user 30.6 s, sys: 340 ms, total: 30.9 s
Wall time: 30.9 s
(75148, 10)


In [0]:
terms = tfidf_vectorizer.get_feature_names()

In [0]:

dist = 1 - cosine_similarity(tfidf_matrix)

#Option 1: KNN Clustering 

In [0]:
from sklearn.cluster import KMeans

num_clusters = 2

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 28.5 s, sys: 84.8 ms, total: 28.5 s
Wall time: 28.6 s


In [0]:
import pandas as pd

data = { 'label': train_data['label'], 'Tweets': train_data['Tweets'], 'cluster': clusters }

frame = pd.DataFrame(data, index = [clusters] , columns = ['label', 'Tweets', 'cluster'])

In [0]:
frame['cluster'].value_counts()

0    38010
1    37138
Name: cluster, dtype: int64

In [0]:
import string
def strip_proppers(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent) if word.islower()]
    return "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in tokens]).strip()

In [0]:
#strip any proper nouns (NNP) or plural proper nouns (NNPS) from a text
from nltk.tag import pos_tag

def strip_proppers_POS(text):
    tagged = pos_tag(text.split()) #use NLTK's part of speech tagger
    non_propernouns = [word for word,pos in tagged if pos != 'NNP' and pos != 'NNPS']
    return non_propernouns

#Option 2: Latent Dirichlet Allocation


In [0]:
#Latent Dirichlet Allocation implementation with Gensim

from gensim import corpora, models, similarities 

#remove proper names
preprocess = [strip_proppers(doc) for doc in train_data['Tweets']]

%time tokenized_text = [tfidf_tokenizer(lemmatizing_preprocess(text)) for text in preprocess]

%time texts = [[word for word in text if word not in stopwords] for text in tokenized_text]

CPU times: user 41.5 s, sys: 280 ms, total: 41.8 s
Wall time: 41.8 s
CPU times: user 4.97 s, sys: 13 ms, total: 4.98 s
Wall time: 4.98 s


In [0]:
dictionary = corpora.Dictionary(texts)

In [0]:
dictionary.filter_extremes(no_below=1, no_above=0.8)

In [0]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [0]:
%time lda = models.LdaModel(corpus, num_topics=2, id2word=dictionary, update_every=5, chunksize=10000, passes=100)

In [0]:
print(lda[corpus[0]])

In [0]:
topics = lda.print_topics(2, num_words=20)

In [0]:
topics_matrix = lda.show_topics(formatted=False, num_words=20)

In [0]:
topics_matrix = np.array(topics_matrix)

In [0]:
topic_words = topics_matrix[:,:,1]

In [0]:
for i in topic_words:
    print([str(word) for word in i])
    print()