In [31]:
# tasks:
# import global warming dataset(csv)
# select "tweet, existence, existence.confidence" columns
# filter "tweet" rows to climate change deniers: keep rows whose "existence" is "No" or "N"
# filter by classification confidence > 0.5
# clean up the tweets, remove unknown characters, remove [link] string
# export as txt file

In [32]:
import pandas

In [33]:
# read the raw tweet dataset from its csv file into a dataframe
# (the relative path is resolved against the current working directory)
data = pandas.read_csv(filepath_or_buffer="data/tweet_global_warming.csv")

In [34]:
# keep only the three columns the analysis needs: "tweet, existence, existence.confidence"
# the columns are named explicitly (instead of slicing "tweet":"existence.confidence")
# so the selection does not depend on the order of the columns in the csv
tweets = data.loc[:, ["tweet", "existence", "existence.confidence"]]

# remove rows where existence is a string of 4 or more characters
# an existence label that long suggests that the row is broken data
# rows with a missing existence label are removed as well, because NaN < 4 evaluates to False
mask0 = tweets["existence"].str.len() < 4
# take an explicit copy: the column assignment below then modifies an independent dataframe
# rather than a filtered slice, which avoids pandas' SettingWithCopyWarning
tweets = tweets[mask0].copy()

# the dataset has 2 labels for negative sentiment, "N" and "No"
# replace "N" with "No" to make sure that every negative sentiment row has existence == "No"
# regex=False treats "N" as a literal value, so only cells equal to "N" are replaced
tweets["existence"] = tweets["existence"].replace("N", "No", regex=False)


In [35]:
# select only rows with negative tweets (existence == "No")
mask1 = tweets["existence"] == "No"
negative_tweets = tweets[mask1]

# select only rows where existence.confidence is higher than 0.5
# the column may hold its values as strings, so it is converted to numbers first
# errors="coerce" turns a value that cannot be parsed into NaN instead of raising an error;
# NaN > 0.5 is False, so a row with an unreadable confidence is left out
# rather than aborting the whole cell
mask2 = pandas.to_numeric(negative_tweets["existence.confidence"], errors="coerce") > 0.5
negative_tweets = negative_tweets[mask2]

# keep only the tweet column
# this operation turns the pandas dataframe into a pandas series
negative_tweets = negative_tweets["tweet"]

In [36]:
# strip the "[link]" placeholder and the unknown-character marker out of every tweet
# regex=False makes both patterns literal text; as a regex, "[link]" would be a character class
clean_tweets = (
    negative_tweets
    .str.replace("[link]", "", regex=False)
    .str.replace("�", "", regex=False)
)

In [37]:
# export the cleaned tweets as a plain text file, one tweet per line
# the file is written directly instead of through to_csv: the csv writer wraps any tweet
# that contains a double quote in quotes and doubles the inner quotes, which corrupts the text
# utf-8 is set explicitly because it is the encoding to_csv uses by default
with open("data/clean_tweets.txt", "w", encoding="utf-8") as tweets_file:
    # missing tweets are skipped instead of being written as empty entries
    for tweet in clean_tweets.dropna():
        # fold line breaks inside a tweet into spaces so that each tweet stays on a single line
        tweets_file.write(" ".join(str(tweet).splitlines()) + "\n")