# Scraping Tweets

Initializing config files.

In [1]:
# Mount Google Drive inside the Colab VM at /content/drive so that later cells
# can work with files stored there.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# List the current working directory (IPython automagic for the shell `ls`).
ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [3]:
# Disabled alternative project location, kept for reference only:
#cd drive/MyDrive/temp/Sentiment_Analysis

In [4]:
# Switch into the project folder on the mounted Drive; later cells open files
# such as 'twitter.properties' and "Oxygen.csv" by relative path.
cd /content/drive/My Drive/Sentiment_Analysis

/content/drive/My Drive/Sentiment_Analysis


In [5]:
import configparser
import tweepy as tw
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
import nltk
import numpy as np



In [6]:
# Fetch the NLTK resources this notebook relies on.
nltk.download('punkt')                       # tokenizer models (not referenced by the cells visible here)
nltk.download('stopwords')                   # English stopword list used to build `unwanted`
nltk.download('names')                       # first-name corpus, also added to `unwanted`
nltk.download('movie_reviews')               # labelled reviews used for the positive/negative word lists
nltk.download('averaged_perceptron_tagger')  # tagger model for nltk.pos_tag
nltk.download('vader_lexicon')               # lexicon required by SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [7]:
# Load the Twitter API credentials from the local properties file.
# read() silently skips files it cannot open and returns the list of files it
# actually parsed, so check that list here to fail with a clear message
# instead of a NoSectionError in a later cell.
config = configparser.RawConfigParser()
loaded_files = config.read(filenames='twitter.properties')
if not loaded_files:
    raise FileNotFoundError(
        "twitter.properties was not found (or could not be read) in the current working directory"
    )
print(config.sections())

['twitter']


Create four variables and assign each one the corresponding key read from the `[twitter]` section of the config file.

In [8]:
# Read the four credentials from the [twitter] section in one pass.
# The option names are listed in the same order as the variables they fill.
api_key, api_secret_key, access_token, access_token_secret = (
    config.get('twitter', option)
    for option in ('apikey', 'apisecretkey', 'accesstoken', 'accesstokensecret')
)

Create the `OAuthHandler`, which is required for authentication with Twitter.

In [9]:
# Build the OAuth handler from the API key/secret, then attach the access
# token pair; both pairs were read from twitter.properties above.
auth = tw.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_token_secret)

Create the tweepy API client. `wait_on_rate_limit=True` is needed because Twitter limits the number of requests; with it, tweepy waits for the limit to reset.

In [12]:
# Authenticated API client used by the search below; wait_on_rate_limit=True
# asks tweepy to wait when Twitter's rate limit is hit.
api  = tw.API(auth,wait_on_rate_limit=True)

In [13]:
# Search parameters consumed by the tw.Cursor call that follows:
# the hashtag used as query `q`, and the date passed as `since`.
search_term = "#oxygen"
date_since  = "2021-03-01"

Create a tweepy item iterator, `tweets`, for the search by calling `tw.Cursor`, pulling only 1,000 records.

In [14]:
# Iterate over search results for `search_term`, restricted to English,
# stopping after 1000 items.
# NOTE(review): `api.search` and the `since=` keyword look like the older
# tweepy 3.x interface — confirm against the installed tweepy version.
tweets = tw.Cursor(api.search,
                   q = search_term,
                   lang = 'en',
                   since = date_since).items(1000)
                   
# Keep three fields per tweet: the text, the author's screen name and the
# author's location field.
# NOTE(review): `tweet.text` appears to be truncated (the preview shown below
# this cell's DataFrame has texts ending in "…") — confirm whether the full
# text is needed.
tweet_details = [[tweet.text,tweet.user.screen_name,tweet.user.location] for tweet in tweets]

Converting to DataFrame

In [15]:
# Widen the column display so long tweet texts are not cut off in previews.
pd.set_option("max_colwidth", 800)

# One row per scraped tweet; the text column is called 'tweet' from the start.
tweet_df = pd.DataFrame(tweet_details, columns=["tweet", "user", "location"])

In [16]:
# Preview the first rows of the scraped data.
tweet_df.head()

Unnamed: 0,tweet,user,location
0,RT @mkheatsinks: Oxygen Concentrator with 3 years warranty advertisement in today's HT #mustread #OxygenEmergency #Oxygen #concentrator #Co…,SonnuSood,"Mumbai, India"
1,Y’all thought it was over? Nope. New season of the “Bad Girls Club” on the @thezeusnetwork so suscribe!!!… https://t.co/HQ6cpQKT95,ALOHA_B0Y,"Chicago, Il/ EwaBeach Hawai'i"
2,RT @Bikuengr: #COVIDEmergencyIndia \n#COVIDEmergency \n\nRanchi Oxygen Cylinder refill available.\n\n#Verified \n#Ranchi #Oxygen \n#oxygenrefill\n#…,SonnuSood,"Mumbai, India"
3,RT @switchindiaorg: We will be creating a thread for sharing consolidated and verified information about #Mumbai #HospitalBed #Ventilator #…,BabliKu23133587,
4,"RT @PIB_India: Health Experts on #COVID19: \n\n◾With oxygen saturation 93-94%, there's no need to really take high inflow oxygen\n◾Taking oxyg…",debasish_224,India


Convert the tweets to lowercase.

In [17]:
# Lowercase every tweet text using the vectorised .str accessor.
tweet_df['tweet'] = tweet_df['tweet'].str.lower()

# Merging into the CSV

In [18]:
# Append the freshly scraped tweets to the running archive in Oxygen.csv.
try:
    old_df = pd.read_csv("Oxygen.csv")
except FileNotFoundError:
    # First run: no archive exists yet, so start from an empty frame that has
    # the same columns as the new data.
    old_df = tweet_df.iloc[:0]

tweet_df = pd.concat([old_df, tweet_df], ignore_index=True)

# The archive is written with its index, which comes back from read_csv as an
# "Unnamed: 0" column; errors="ignore" keeps this working when it is absent.
tweet_df = tweet_df.drop(columns="Unnamed: 0", errors="ignore")

# Re-running this cell, or scraping an overlapping time window, would append
# the same tweets again. A tweet is identified by its text and author here
# (an empty location is written as blank and read back as NaN, so it cannot
# be part of the key). The index is reset so it stays 0..n-1.
tweet_df = tweet_df.drop_duplicates(subset=["tweet", "user"]).reset_index(drop=True)

tweet_df.to_csv("Oxygen.csv")

# Adding labels to dataframe

In [19]:
# Words to exclude when building the sentiment word lists: English stopwords
# plus the lower-cased entries of the NLTK names corpus.
# Kept as a set because skip_unwanted() tests membership once per corpus
# token; a list would be scanned linearly on every lookup.
unwanted = set(nltk.corpus.stopwords.words("english"))
unwanted.update(w.lower() for w in nltk.corpus.names.words())

def skip_unwanted(pos_tuple):
    """Filter predicate for ``(word, POS-tag)`` pairs.

    Returns True only when the word is purely alphabetic, is not contained in
    ``unwanted``, and its tag does not start with "NN"; otherwise False.
    """
    token, pos = pos_tuple
    # The alphabetic check runs first, so non-alphabetic tokens are rejected
    # without consulting `unwanted`.
    is_candidate = token.isalpha() and token not in unwanted
    return is_candidate and not pos.startswith("NN")

# Build the positive / negative vocabularies from the movie_reviews corpus:
# POS-tag every token of a category, then keep the word of each
# (word, tag) pair that skip_unwanted() accepts.
positive_words = [
    pair[0]
    for pair in nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["pos"]))
    if skip_unwanted(pair)
]
negative_words = [
    pair[0]
    for pair in nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["neg"]))
    if skip_unwanted(pair)
]

In [20]:
# VADER sentiment analyzer shared by the labelling cells below; it needs the
# 'vader_lexicon' resource downloaded earlier in the notebook.
sid = SentimentIntensityAnalyzer()

In [21]:
# Quick check of the analyzer on a sample sentence; the result is a dict with
# 'compound', 'neg', 'neu' and 'pos' scores.
sid.polarity_scores("She is gone")

{'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}

In [22]:
def label(sentence):
  """Return 1 when VADER's compound score for ``sentence`` is below zero, else 0."""
  scores = sid.polarity_scores(sentence)
  return 1 if scores['compound'] < 0 else 0

In [23]:
# Label every tweet with label() (1 = negative compound score, 0 otherwise).
# Series.map visits the values directly, so this does not depend on tweet_df
# having a 0..n-1 index the way the label lookup `tweet_df.tweet[i]` did.
# `L` is kept as a plain list of ints, as before.
L = tweet_df['tweet'].map(label).tolist()

tweet_df['label'] = L

Dividing data into train and test set.

In [24]:
# Reload of the archived tweets. `df` is not used by the split below; it is
# kept so that any other cell expecting it still finds it.
df = pd.read_csv("Oxygen.csv")

# Seeded generator so the train/test partition is reproducible across runs.
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)

# One uniform draw in [0, 1) per row, stored in 'split'. Rows whose draw is
# <= 0.7 go to the training set (about 70%), the rest to the test set, so the
# 'split' column records the value that decided each row's partition.
tweet_df['split'] = rng.random(len(tweet_df))
msk = (tweet_df['split'] <= 0.7).values
train = tweet_df[msk]
test = tweet_df[~msk]
train.to_csv("train.csv")
test.to_csv("test.csv")