In [44]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import json
import nltk
import re
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from tqdm.notebook import tqdm
!python -m nltk.downloader all
!pip install transformers
from transformers import pipeline

In [None]:
with open('/content/drive/MyDrive/tweets.json') as jfile:
    tweets = json.load(jfile)

In [None]:
print(f"Number of tweets in json file : {len(tweets.keys())}")

Number of tweets in json file : 43347


In [None]:
#convert json file to pandas dataframe
tweet_df = pd.DataFrame(columns=['tweet author','tweet text'])

#append tweets in tweet_df dataframe
for key in tqdm(tweets.keys()):
    tweet_author = tweets[key]["tweet_author"]
    tweet_text = tweets[key]["tweet_text"]
    tweet_df = tweet_df.append({'tweet author':tweet_author,"tweet text":tweet_text}, ignore_index=True)
    
tweet_df.head()

HBox(children=(FloatProgress(value=0.0, max=43347.0), HTML(value='')))




Unnamed: 0,tweet author,tweet text
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...


In [None]:
new_tweet_df = tweet_df.copy()

In [None]:
print(f"Total number of tweets : {len(tweet_df)}")

Total number of tweets : 43347


In [None]:
#drop duplicate tweets
tweet_df = tweet_df.drop_duplicates().reset_index()

In [None]:
print(f"Total number of unique tweets : {len(tweet_df)}")

Total number of unique tweets : 41818


In [None]:
unique_authors = len(tweet_df["tweet author"].unique())
print(f"There are {unique_authors} unique authors in the json file.")

There are 9292 unique authors in the json file.


In [None]:
#print 5 sample tweets
for i in range(0,5):
    tweet = tweet_df["tweet text"][i]
    print(f"Tweet {i +1 } : {tweet} \n")

Tweet 1 : ⚕️ Scientists conducted a Phase II study of acalabrutinib in patients with relapsed/refractory #CLL who were ibrutinib-intolerant, and found an overall response rate of 73%. 
https://t.co/eJ6m4QpC5P https://t.co/kuZz6ZO47r 

Tweet 2 : This phase 2 Acalabrutinib-Venetoclax (AV) trial that is still in recruitment phase will study how well venetoclax and acalabrutinib works in MCL patients who either relapsed or non-respondent to the initial therapy.

https://t.co/gg0G9At23N 

Tweet 3 : #NICE backs #AstraZenecas #Calquence for #CLL https://t.co/Vb5lPDoGrA 

Tweet 4 : #acalabrutinib is a valuable option in pts intolerant to #ibrutinib. Further valuable data to help decision making in #CLL 

Early View | Haematologica https://t.co/Z2kCLZaX0D 

Tweet 5 : NICE has recommended the use of acalabrutinib for patients with treatment-naïve chronic lymphocytic leukemia. Find out more here https://t.co/6OuJptLCIN #lymsm #lymphoma 



## Data Cleaning Pipeline

In [None]:
#remove links from the tweets
tweet_df["tweet text"] = tweet_df["tweet text"].apply(lambda tweet : re.sub(r"https\S+","",tweet))

In [None]:
#remove hashtags,mentions from the tweets
tweet_df["tweet text"] = tweet_df["tweet text"].apply(lambda tweet : re.sub(r"[\@\#\$]\w+","",tweet))

In [None]:
#lowercase the words of tweets
tweet_df["tweet text"] = tweet_df["tweet text"].apply(lambda tweet : tweet.lower())

In [None]:
stop_words = set(stopwords.words('english')) 
def remove_stop_word(tweet):
    """Return tweet after removing stopwords"""
    tokenized_tweet = word_tokenize(tweet)
    filtered_tweet = [word for word in tokenized_tweet if word not in stop_words]
    return " ".join(filtered_tweet)

#apply remove_stop_word on tweets
tweet_df["tweet text"] = tweet_df["tweet text"].apply(remove_stop_word)

In [None]:
#remove punctuation from the tweets
tweet_df["tweet text"] = tweet_df["tweet text"].apply(lambda tweet : re.sub('[%s]' % re.escape(string.punctuation)," ",tweet))

In [None]:
#remove all tokens that are not alphabetic
def filter_tweets(tweet):
    """Fiter out the non alphanumeric words"""
    splitted_text = word_tokenize(tweet)
    filtered_tweet = [word for word in splitted_text if word.isalpha()]
    return " ".join(filtered_tweet)

#remove punctuation from the tweets
tweet_df["tweet text"] = tweet_df["tweet text"].apply(filter_tweets)

In [None]:
tweets_data = tweet_df["tweet text"]

In [None]:
#Drop duplicate tweets 
tweets_data = tweets_data.drop_duplicates().reset_index(drop=True)

In [None]:
print(f"Number of unique tweets : {len(tweets_data)}")

Number of unique tweets : 31840


## Finding Entities

In [None]:
combined_tweets = ''
for tweet in tweets_data:
    combined_tweets = combined_tweets +" "+ tweet

In [55]:
#find most frequently occured trigrams
trigram_fd = nltk.FreqDist(nltk.trigrams(combined_tweets.split()))

trigrams = []
trigrams_freq = []

#top 50 most frequently used trigrams
for val in trigram_fd.most_common(n=50):
    if len(val[0][0])>1 and len(val[0][1])>1 and len(val[0][2])>1:
      trigram = " ".join(val[0])
      if "http" not in trigram and "article" not in trigram:
        trigrams.append(trigram)
        trigrams_freq.append(val[1])

pd.DataFrame({"Trigram":trigrams,"Frequency":trigrams_freq})

In [63]:
#find most frequently occured bigrams
bigram_fd = nltk.FreqDist(nltk.bigrams(combined_tweets.split()))

bigrams = []
bigrams_freq = []

#top 100 most frequently used bigrams
for val in bigram_fd.most_common(n=100):
    if len(val[0][0])>2 and len(val[0][1])>2:
      bigram = " ".join(val[0])
      if "http" not in bigram and "article" not in bigram:
        bigrams.append(bigram)
        bigrams_freq.append(val[1])

pd.DataFrame({"Bigram":bigrams,"Frequency":bigrams_freq})

Unnamed: 0,Bigram,Frequency
0,chronic lymphocytic,11519
1,lymphocytic leukemia,9669
2,patients chronic,1481
3,leukemia cll,1181
4,relapsed refractory,1015
...,...,...
76,disease progression,121
77,ibrutinib plus,121
78,calquence acalabrutinib,120
79,cll chronic,120


In [83]:
#frequently occuring entities in tweets
entities = bigrams + trigrams
entities

['chronic lymphocytic',
 'lymphocytic leukemia',
 'patients chronic',
 'leukemia cll',
 'relapsed refractory',
 'refractory chronic',
 'small lymphocytic',
 'treatment chronic',
 'cell lymphoma',
 'mantle cell',
 'lymphocytic lymphoma',
 'patients relapsed',
 'lymphocytic leukaemia',
 'clinical trial',
 'leukemia small',
 'leukemia patients',
 'high risk',
 'leukemia new',
 'long term',
 'first line',
 'treatment patients',
 'previously untreated',
 'non hodgkin',
 'cll patients',
 'btk inhibitor',
 'fda approves',
 'blood cancer',
 'phase iii',
 'clinical trials',
 'relapsed chronic',
 'hodgkin lymphoma',
 'btk inhibitors',
 'treatment options',
 'cell therapy',
 'cancer drug',
 'untreated chronic',
 'lymphoma chronic',
 'phase study',
 'line treatment',
 'leukemia chronic',
 'patients cll',
 'leukemia via',
 'cell chronic',
 'residual disease',
 'leukemia cells',
 'new drug',
 'cll sll',
 'previously treated',
 'diagnosed chronic',
 'minimal residual',
 'progression free',
 'free sur

In [81]:
tweet_df = tweet_df.drop(["index"],axis=1)

## Analysing sentiment of tweets using hugging face transformers

In [None]:
sentiment_analysis = pipeline("sentiment-analysis")

In [None]:
#predicting sentiment for each tweet
tweet_sentiments = []
sentiment_scores = []

for tweet in tqdm(tweet_df["tweet text"]):
    sentiment_result = sentiment_analysis(tweet)[0]
    tweet_sentiments.append(sentiment_result["label"])
    sentiment_scores.append(sentiment_result["score"])

pd.DataFrame({"Sentiment":tweet_sentiments,"Score":sentiment_scores}).to_csv("/content/drive/MyDrive/tweet_sentiments.csv",index=False)

In [102]:
sentiment_df = pd.read_csv("/content/drive/MyDrive/tweet_sentiments.csv")
sentiment_df.head()

Unnamed: 0,Sentiment,Score
0,NEGATIVE,0.936341
1,NEGATIVE,0.9662
2,NEGATIVE,0.9677
3,POSITIVE,0.971031
4,POSITIVE,0.674825


In [104]:
tweet_df["sentiment"] = sentiment_df["Sentiment"]

## Aanlysing sentiments of authors for different etities

In [148]:
final_entities_dict = {"author":[]}
for entity in entities:
  final_entities_dict[entity] = []
for author in tqdm(tweet_df["tweet author"].unique()):
    sub_df = tweet_df[tweet_df["tweet author"]==author]
    entities_dict = dict()
    for i in range(len(sub_df)):
        tweet = sub_df.iloc[i]
        try:
          tweet_bigrams = [" ".join(bigram) for bigram in nltk.bigrams(tweet["tweet text"].split())]
          tweet_trigrams = [" ".join(trigram) for trigram in nltk.trigrams(tweet["tweet text"].split())]
        except:
          #print(tweet["tweet text"])
          pass
        keywords = set(tweet_bigrams+tweet_trigrams)
        entities_mateched = set(keywords)&set(entities)
        for entity in entities_mateched:
            if entity not in list(entities_dict.keys()):
              entities_dict[entity] = [tweet["sentiment"]]
            else:
              entities_dict[entity].append(tweet["sentiment"])
    for entity in list(entities_dict.keys()):
      if "NEGATIVE" in entities_dict[entity] and "POSITIVE" in entities_dict[entity]:
        pos_count = entities_dict[entity].count("POSITIVE")
        neg_count = entities_dict[entity].count("NEGATIVE")
        if pos_count>neg_count:
          entities_dict[entity] = "POSITIVE"
        else:
          entities_dict[entity] = "NEGATIVE"
      else:
         entities_dict[entity] =  entities_dict[entity][0]
    for key in final_entities_dict:
      if key == "author":
        final_entities_dict[key].append(author)
      elif key in entities_dict.keys():
        final_entities_dict[key].append(entities_dict[key])
      else:
        final_entities_dict[key].append(None)

HBox(children=(FloatProgress(value=0.0, max=9292.0), HTML(value='')))




In [149]:
author_sentiment_df = pd.DataFrame(final_entities_dict)
author_sentiment_df.head()

Unnamed: 0,author,chronic lymphocytic,lymphocytic leukemia,patients chronic,leukemia cll,relapsed refractory,refractory chronic,small lymphocytic,treatment chronic,cell lymphoma,mantle cell,lymphocytic lymphoma,patients relapsed,lymphocytic leukaemia,clinical trial,leukemia small,leukemia patients,high risk,leukemia new,long term,first line,treatment patients,previously untreated,non hodgkin,cll patients,btk inhibitor,fda approves,blood cancer,phase iii,clinical trials,relapsed chronic,hodgkin lymphoma,btk inhibitors,treatment options,cell therapy,cancer drug,untreated chronic,lymphoma chronic,phase study,line treatment,...,chronic lymphocytic leukemia,patients chronic lymphocytic,lymphocytic leukemia cll,refractory chronic lymphocytic,relapsed refractory chronic,treatment chronic lymphocytic,small lymphocytic lymphoma,chronic lymphocytic leukaemia,mantle cell lymphoma,lymphocytic leukemia small,leukemia small lymphocytic,patients relapsed refractory,lymphocytic leukemia patients,lymphocytic leukemia new,relapsed chronic lymphocytic,untreated chronic lymphocytic,lymphoma chronic lymphocytic,non hodgkin lymphoma,lymphocytic leukemia via,cell chronic lymphocytic,lymphocytic leukemia cells,treatment patients chronic,leukemia chronic lymphocytic,diagnosed chronic lymphocytic,minimal residual disease,first line treatment,new clinical trial,therapy chronic lymphocytic,previously untreated chronic,lymphocytic leukemia treatment,progression free survival,chronic lymphocytic cll,patient chronic lymphocytic,risk chronic lymphocytic,lymphocytic leukemia chronic,cll chronic lymphocytic,ibrutinib chronic lymphocytic,breakthrough therapy designation,bruton tyrosine kinase,high risk chronic
0,Hematopoiesis News,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,,NEGATIVE,NEGATIVE,NEGATIVE,,NEGATIVE,NEGATIVE,NEGATIVE,POSITIVE,,,NEGATIVE,NEGATIVE,,,NEGATIVE,,NEGATIVE,,NEGATIVE,,,,,,,,,,NEGATIVE,POSITIVE,...,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,,,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,,,,,,,NEGATIVE,,,,,,POSITIVE,,NEGATIVE,,,,,,POSITIVE,,,,NEGATIVE,,POSITIVE
1,"Michael Wang, MD",POSITIVE,POSITIVE,,POSITIVE,NEGATIVE,,POSITIVE,,POSITIVE,POSITIVE,POSITIVE,NEGATIVE,,NEGATIVE,POSITIVE,,,,POSITIVE,,,,,POSITIVE,NEGATIVE,,,,,,,NEGATIVE,,,,,,POSITIVE,,...,POSITIVE,,POSITIVE,,,,POSITIVE,,POSITIVE,POSITIVE,POSITIVE,NEGATIVE,,,,,,,,,,,,,,,,,,,,,,,,,,,POSITIVE,
2,1stOncology,NEGATIVE,NEGATIVE,NEGATIVE,,NEGATIVE,,NEGATIVE,NEGATIVE,,,NEGATIVE,POSITIVE,,,NEGATIVE,,NEGATIVE,,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,,NEGATIVE,,NEGATIVE,NEGATIVE,NEGATIVE,,NEGATIVE,,,,,,NEGATIVE,,NEGATIVE,NEGATIVE,...,NEGATIVE,NEGATIVE,,,,NEGATIVE,NEGATIVE,,,NEGATIVE,NEGATIVE,NEGATIVE,,,NEGATIVE,,,,,,,NEGATIVE,,,,NEGATIVE,,,NEGATIVE,,NEGATIVE,,,,,,,,,
3,Toby Eyre,NEGATIVE,NEGATIVE,NEGATIVE,,NEGATIVE,,,POSITIVE,NEGATIVE,NEGATIVE,,,POSITIVE,NEGATIVE,,POSITIVE,NEGATIVE,,NEGATIVE,NEGATIVE,,,,NEGATIVE,NEGATIVE,,NEGATIVE,POSITIVE,NEGATIVE,NEGATIVE,,NEGATIVE,,,,,,POSITIVE,,...,NEGATIVE,NEGATIVE,,,,POSITIVE,,POSITIVE,NEGATIVE,,,,POSITIVE,,NEGATIVE,,,,,NEGATIVE,,,,,POSITIVE,,,,,,,,,NEGATIVE,,,,,,NEGATIVE
4,Lymphoma Hub,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,,POSITIVE,,NEGATIVE,NEGATIVE,POSITIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,,,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,POSITIVE,POSITIVE,NEGATIVE,NEGATIVE,POSITIVE,NEGATIVE,NEGATIVE,NEGATIVE,...,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,,NEGATIVE,,NEGATIVE,NEGATIVE,NEGATIVE,,,NEGATIVE,POSITIVE,NEGATIVE,NEGATIVE,,,,POSITIVE,,,NEGATIVE,NEGATIVE,,NEGATIVE,POSITIVE,,,,,,,,,NEGATIVE,,


In [150]:
author_sentiment_df.to_csv("nlp_assignment_submission_pranjal_soni.csv",index=False)