# Cleaning Tweets for Modeling

In [1]:
import pandas as pd
import cleaning_utils as utils

### Step 1: Load twitter data

In [2]:
# Load just the label, location and tweet_text columns from the CSV.
wanted_columns = ['label', 'location', 'tweet_text']
df = pd.read_csv('abortion_data.csv', usecols=wanted_columns)
df.head()

Unnamed: 0,label,location,tweet_text
0,0,"Milford, CT",50 MILLION unborn children have lost their liv...
1,0,"Atlanta, Georgia",@LiveAction #ProLife #abortion #Parenthood #ba...
2,0,,@NARAL @ProChoiceCA Definitely NOT safe for th...
3,0,,@MichaelKellyIC @1Hildegarde I think she wants...
4,0,,YouTube censors lifesaving abortion informatio...


### Step 2: Remove twitter handles

In [3]:
# Work on a copy so the raw `df` loaded earlier is left untouched.
df_copy_1 = df.copy()
# Run every tweet through utils.remove_handles; in the preview the @mentions
# that started rows 1-3 are gone afterwards.
without_handles = df_copy_1['tweet_text'].apply(utils.remove_handles)
df_copy_1['tweet_text'] = without_handles
df_copy_1.head()

Unnamed: 0,label,location,tweet_text
0,0,"Milford, CT",50 MILLION unborn children have lost their liv...
1,0,"Atlanta, Georgia",#ProLife #abortion #Parenthood #babies #mother...
2,0,,Definitely NOT safe for the baby. #abortion
3,0,,I think she wants to see herself as #prolife a...
4,0,,YouTube censors lifesaving abortion informatio...


### Step 3: Remove non-English

In [4]:
# Pass every tweet through utils.remove_non_english and store the result back.
# NOTE(review): the five previewed rows come out unchanged, so the exact
# filtering rule is not visible here - confirm in cleaning_utils.
english_only = df_copy_1['tweet_text'].apply(utils.remove_non_english)
df_copy_1['tweet_text'] = english_only
df_copy_1.head()

Unnamed: 0,label,location,tweet_text
0,0,"Milford, CT",50 MILLION unborn children have lost their liv...
1,0,"Atlanta, Georgia",#ProLife #abortion #Parenthood #babies #mother...
2,0,,Definitely NOT safe for the baby. #abortion
3,0,,I think she wants to see herself as #prolife a...
4,0,,YouTube censors lifesaving abortion informatio...


### Step 4: Remove numbers

In [5]:
# Pass every tweet through utils.remove_numbers (row 0 loses its leading "50").
without_numbers = df_copy_1['tweet_text'].apply(utils.remove_numbers)
df_copy_1['tweet_text'] = without_numbers
df_copy_1.head()

Unnamed: 0,label,location,tweet_text
0,0,"Milford, CT",MILLION unborn children have lost their lives...
1,0,"Atlanta, Georgia",#ProLife #abortion #Parenthood #babies #mother...
2,0,,Definitely NOT safe for the baby. #abortion
3,0,,I think she wants to see herself as #prolife a...
4,0,,YouTube censors lifesaving abortion informatio...


### Step 5: Lowercase everything

In [6]:
# Lowercase every tweet with the vectorized .str accessor instead of
# .apply(str.lower): the result is the same for string values, but a missing
# value (NaN) is passed through as NaN rather than raising TypeError.
df_copy_1['tweet_text'] = df_copy_1['tweet_text'].str.lower()
df_copy_1.head()

Unnamed: 0,label,location,tweet_text
0,0,"Milford, CT",million unborn children have lost their lives...
1,0,"Atlanta, Georgia",#prolife #abortion #parenthood #babies #mother...
2,0,,definitely not safe for the baby. #abortion
3,0,,i think she wants to see herself as #prolife a...
4,0,,youtube censors lifesaving abortion informatio...


### Step 6: Remove punctuation

In [7]:
# Pass every tweet through utils.remove_punc and store the result back.
# In the preview the full stop in row 2 disappears while the '#' of hashtags stays.
without_punctuation = df_copy_1['tweet_text'].apply(utils.remove_punc)
df_copy_1['tweet_text'] = without_punctuation
df_copy_1.head()

Unnamed: 0,label,location,tweet_text
0,0,"Milford, CT",million unborn children have lost their lives...
1,0,"Atlanta, Georgia",#prolife #abortion #parenthood #babies #mother...
2,0,,definitely not safe for the baby #abortion
3,0,,i think she wants to see herself as #prolife a...
4,0,,youtube censors lifesaving abortion informatio...


### Step 7: Remove stop-words - only 'a', 'the'

In [8]:
# Pass every tweet through utils.remove_stop and store the result back
# (row 2 goes from "for the baby" to "for baby").
without_stop_words = df_copy_1['tweet_text'].apply(utils.remove_stop)
df_copy_1['tweet_text'] = without_stop_words
df_copy_1.head()

Unnamed: 0,label,location,tweet_text
0,0,"Milford, CT",million unborn children have lost their lives...
1,0,"Atlanta, Georgia",#prolife #abortion #parenthood #babies #mother...
2,0,,definitely not safe for baby #abortion
3,0,,i think she wants to see herself as #prolife a...
4,0,,youtube censors lifesaving abortion informatio...


### Step 8: Separate each word by exactly 1 space

In [9]:
# Pass every tweet through utils.remove_double_space so that words left with
# extra gaps by the earlier removals are separated by one space.
single_spaced = df_copy_1['tweet_text'].apply(utils.remove_double_space)
df_copy_1['tweet_text'] = single_spaced
df_copy_1.head()

Unnamed: 0,label,location,tweet_text
0,0,"Milford, CT",million unborn children have lost their lives ...
1,0,"Atlanta, Georgia",#prolife #abortion #parenthood #babies #mother...
2,0,,definitely not safe for baby #abortion
3,0,,i think she wants to see herself as #prolife a...
4,0,,youtube censors lifesaving abortion informatio...


### Step 9: Normalize "abortions" and "#abortion" to "abortion"

In [10]:
# Call utils.plural_hashtag(tweet, "abortion") on every tweet; Series.apply
# forwards the extra positional argument after the tweet text itself.
# In the preview "#abortion" becomes plain "abortion".
normalized_text = df_copy_1['tweet_text'].apply(utils.plural_hashtag, args=("abortion",))
df_copy_1['tweet_text'] = normalized_text
df_copy_1.head()

Unnamed: 0,label,location,tweet_text
0,0,"Milford, CT",million unborn children have lost their lives ...
1,0,"Atlanta, Georgia",#prolife abortion #parenthood #babies #motherh...
2,0,,definitely not safe for baby abortion
3,0,,i think she wants to see herself as #prolife a...
4,0,,youtube censors lifesaving abortion informatio...


### Step 10: Remove words outside of frequency threshold - TODO: fix conversion of plurals and hashtags

In [11]:
# Build the word-frequency table from the cleaned tweets. In the preview each
# row is labelled by a tweet's text, each column is a token, and each cell is
# that token's count in the tweet (e.g. 2.0 where "abortion" occurs twice).
freq_matrix = utils.build_freq_matrix(df_copy_1)
freq_matrix.head()

Unnamed: 0_level_0,million,unborn,children,have,lost,their,lives,to,abortion,think,...,#themoreyouknow,#probirth,dumbs,discrediting,picture,subsidize,else's,bigger,grasp,ended
tweet_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
million unborn children have lost their lives to abortion think #stevejobs #davethomas,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#prolife abortion #parenthood #babies #motherhood #pregnant #pp #plannedparenthood #genocide for money #bloodmoney #maga,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
definitely not safe for baby abortion,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
i think she wants to see herself as #prolife and pro abortion but with no empathy for unborn,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
youtube censors lifesaving abortion information #womenonwaves abortion #youtube,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Bounds handed to utils.remove_thresholds.
# NOTE(review): presumably lower/upper frequency cut-offs expressed as a
# fraction of tweets - confirm against cleaning_utils.
lower_bound, upper_bound = 0, .5
# Returns the filtered frame plus the words dropped at each end.
df_copy_2, lower, upper = utils.remove_thresholds(df_copy_1, freq_matrix, lower_bound, upper_bound)
# Show what fell outside the upper bound (here only "abortion").
upper.head()

Unnamed: 0,0
abortion,2878


In [13]:
# Preview the frame returned by utils.remove_thresholds; in the rows whose text
# is fully visible (2 and 4) the word "abortion" no longer appears.
df_copy_2.head()

Unnamed: 0,label,location,tweet_text
0,0,"Milford, CT",million unborn children have lost their lives ...
1,0,"Atlanta, Georgia",#prolife #parenthood #babies #motherhood #preg...
2,0,,definitely not safe for baby
3,0,,i think she wants to see herself as #prolife a...
4,0,,youtube censors lifesaving information #womeno...
