In [37]:
import csv
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import re
import string
from itertools import groupby
import io
import nltk as nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

In [38]:
# Base name (no extension) of the dataset. Later cells read filename + ".csv"
# as the raw input and read/write filename + " cleaned.csv" as the working file.
filename = "Combined US"

In [39]:
# MAKING EVERYTHING LOWERCASE

# Load the raw export, lower-case the 'full text' column, and persist the
# result as the working "cleaned" file that every later cell reloads.
df = pd.read_csv(filename + ".csv", encoding='utf-8-sig')
df = df.assign(**{'full text': df['full text'].str.lower()})
df.to_csv(filename + " cleaned.csv", encoding='utf-8-sig', index=False)
print(df['full text'])

0       connected and automated vehicles and new tech...
1       examples of mdot efforts in this field , whic...
2       implementation and test facilities through pl...
3       new technology e-construction: mdot’s e -cons...
4       emergency response, coordinate information fo...
                             ...                        
707      carmasm and cooperative driving automation c...
708     virtual open innovation collaborative environ...
709                                        what’s next? 
710     what’s next? automated driving systems have t...
711                   www.transportation.gov/av january 
Name: full text, Length: 712, dtype: object


In [40]:
# Reload the working file and discard its second column (positional index 1).
# NOTE: the result overwrites the file this cell reads from, so running the
# cell again removes whichever column is then at index 1.
df = pd.read_csv(filename + " cleaned.csv", encoding='utf-8-sig')
df = df.drop(columns=df.columns[[1]])
#df = df.drop(df.columns[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]], axis=1, inplace = False)
df.to_csv(filename + " cleaned.csv", encoding='utf-8-sig', index=False)

In [41]:
# DELETING LINKS
# (Only tokens containing 'www' or 'https' are removed by this cell; mentions
# and hashtags are not filtered here.)

# Reload the working file written by the previous step.
df = pd.read_csv(filename + " cleaned.csv", encoding='utf-8-sig')

def remove_words(in_list, char_list):
    """Drop every whitespace-separated token that contains any given substring.

    Parameters
    ----------
    in_list : iterable
        Text entries to filter. Entries that are not ``str`` (for example the
        NaN that ``pd.read_csv`` yields for an empty cell, or ``None``) are
        passed through unchanged instead of raising ``AttributeError``.
    char_list : iterable of str
        Substrings to look for. A token is discarded as soon as one of them
        occurs anywhere inside it.

    Returns
    -------
    list
        One entry per input entry, in the same order. String entries are
        rebuilt from the surviving tokens joined by single spaces.
    """
    # Snapshot once so a one-shot iterable (e.g. a generator) can be reused
    # for every token.
    phrases = tuple(char_list)
    new_list = []
    for line in in_list:
        if not isinstance(line, str):
            # Missing value: keep it as-is so a later dropna() can remove it.
            new_list.append(line)
            continue
        kept = [
            word for word in line.split()
            if not any(phrase in word for phrase in phrases)
        ]
        new_list.append(' '.join(kept))
    return new_list
     
# Tokens containing any of these substrings are filtered out of the text column.
char_list = ['www', 'https']
str_list = df['full text']
df['full text'] = remove_words(in_list=str_list, char_list=char_list)
df.to_csv(filename + " cleaned.csv", encoding='utf-8-sig', index=False)
print(df['full text'])

0      connected and automated vehicles and new techn...
1      examples of mdot efforts in this field , which...
2      implementation and test facilities through pla...
3      new technology e-construction: mdot’s e -const...
4      emergency response, coordinate information for...
                             ...                        
707    carmasm and cooperative driving automation car...
708    virtual open innovation collaborative environm...
709                                         what’s next?
710    what’s next? automated driving systems have th...
711                                              january
Name: full text, Length: 712, dtype: object


In [42]:
# DELETING WHITESPACE

# Reload the working file, turn newlines into spaces in every column, squeeze
# whitespace runs in the text column to one space, drop rows that have no
# text, and write the file back.
df = (
    pd.read_csv(filename + " cleaned.csv", encoding='utf-8-sig', skipinitialspace=True)
    .replace(r'\n', ' ', regex=True)
)
df['full text'] = df['full text'].replace(r'\s+', ' ', regex=True)
df = df.dropna(subset=['full text'])
df.to_csv(filename + " cleaned.csv", encoding='utf-8-sig', index=False)

In [43]:
# DEFINES THE STOPWORDS

# Extra tokens appended to NLTK's English stop word list: informal
# contractions, a few filler words, every single letter, and citation
# fragments. Order and repeated entries are kept exactly as before because
# the combined list is printed below.
new_stopwords = [
    "im", "u", "&", 'amp', 'us', 'may', 'thing', 'isnt', 'dont',
    "i'm", "i’m", "we've",
    's', 't', 'c', 'also', 'page',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    'et', 'al', 'uni',
]
stop_words = nltk.corpus.stopwords.words('english')
stop_words += new_stopwords

print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [44]:
# REMOVING STOPWORDS

df = pd.read_csv(filename + " cleaned.csv", encoding='utf-8-sig')
# Nothing in this cell reads these two names; they are kept only in case
# other cells still refer to them.
pos_tweets = df['full text']
test = pd.DataFrame(pos_tweets)

# Set lookup is constant-time per token, unlike scanning the stop word list.
stop_word_set = set(stop_words)

# Rebuild each text from the tokens that are not stop words. Rows whose text
# was read back as NaN (an empty cell in the CSV) are not strings and have no
# .split(); they are passed through unchanged so a later dropna() removes them
# instead of this cell raising AttributeError.
df['full text'] = df['full text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_word_set)
    if isinstance(x, str) else x
)
print(df['full text'])
df.to_csv(filename + " cleaned.csv", index = False, encoding='utf-8-sig')

0      connected automated vehicles new technology wh...
1      examples mdot efforts field , involve infrastr...
2      implementation test facilities planet planet r...
3      new technology e-construction: mdot’s -constru...
4      emergency response, coordinate information pla...
                             ...                        
707    carmasm cooperative driving automation carma f...
708    virtual open innovation collaborative environm...
709                                         what’s next?
710    what’s next? automated driving systems potenti...
711                                              january
Name: full text, Length: 712, dtype: object


In [45]:
# REMOVES PUNCTUATION

df = pd.read_csv(filename + " cleaned.csv", encoding='utf-8-sig')
# Replace every character that is neither a word character nor whitespace
# with a space. regex=True must be explicit: without it, newer pandas treats
# the pattern as a literal string and no punctuation is removed. The raw
# string avoids Python's invalid-escape warning for \w and \s.
df['full text'] = df['full text'].str.replace(r'[^\w\s]', ' ', regex=True)
#df['hashtags'] = df['hashtags'].str.replace(r'[^\w\s]', '', regex=True)
df.to_csv(filename + " cleaned.csv", index = False, encoding='utf-8-sig')

  df['full text'] = df['full text'].str.replace('[^\w\s]',' ')


In [46]:
# DELETING WHITESPACE #2

# Same clean-up as the first whitespace pass, repeated because punctuation
# removal inserts extra spaces: newlines become spaces in every column,
# whitespace runs in the text column collapse to one space, and rows without
# text are dropped before the file is written back.
df = (
    pd.read_csv(filename + " cleaned.csv", encoding='utf-8-sig', skipinitialspace=True)
    .replace(r'\n', ' ', regex=True)
)
df['full text'] = df['full text'].replace(r'\s+', ' ', regex=True)
df = df.dropna(subset=['full text'])
df.to_csv(filename + " cleaned.csv", encoding='utf-8-sig', index=False)

In [47]:
# DELETING NUMBERS

df = pd.read_csv(filename + " cleaned.csv", encoding='utf-8-sig')
# Strip every run of digits. regex=True must be explicit: without it, newer
# pandas searches for the literal text "\d+" and leaves all numbers in place.
# The raw string avoids Python's invalid-escape warning for \d.
df['full text'] = df['full text'].str.replace(r'\d+', '', regex=True)
df.to_csv(filename + " cleaned.csv", index = False, encoding='utf-8-sig')

  df['full text'] = df['full text'].str.replace('\d+', '')


In [48]:
# # DELETING OTHER DUPE TWEETS

# df = pd.read_csv(filename + " cleaned.csv", encoding='utf-8-sig')
# df = df[df["tweet"].str.contains("poltergeist") == False]
# df = df[df["tweet"].str.contains("cliche things unavoidable life death taxes probably add new technology list artificial intelligence") == False]
# df = df[df["tweet"].str.contains("environmental tradeoffs autonomous vehicles convenience likely come") == False]
# df = df[df["tweet"].str.contains("tory mps squealing big brother cockpit putting risk lives british roads also undermining uk leadership promoting future deployment connected autonomous vehicles cavs") == False]
# #df = df[df["tweet"].str.contains("concerns about safe deployment of autonomous vehicles aired at congressional hearing") == False]
# #df = df[df["tweet"].str.contains("to make the most of autonomous vehicles advantages and avoid the disadvantages we must choose to shape our cities") == False]
# #df = df[df["tweet"].str.contains("autonomous vehicles hold great promise to deliver significant benefits for all americans  but only if the federal government puts the necessary policies in place to achieve these benefits") == False]
# #df = df[df["tweet"].str.contains("semiautonomous vehicles are creating an injury litigation risk for insurers") == False]
# #df = df[df["tweet"].str.contains("the cliche is that the only things unavoidable in life are death and taxes we can probably add new technology to the list too artificial intelligence") == False]
# #df = df[df["tweet"].str.contains("tory mps squealing about big brother in the cockpit are not just putting at risk lives on british roads but are also undermining uk leadership in promoting the future deployment of connected and autonomous vehicles cavs") == False]
# #df = df[df["tweet"].str.contains("3 security issues facing selfdriving cars") == False]
# #df = df[df["tweet"].str.contains("poltergeist") == False]
# #df = df[df["tweet"].str.contains("poltergeist") == False]
# #df = df[df["tweet"].str.contains("poltergeist") == False]
# #df = df[df["tweet"].str.contains("poltergeist") == False]
# sizeafter = df.shape
# print('size before =', sizeinitial)
# print('size after =', sizeafter)
# df.to_csv(filename + " cleaned.csv", index = False, encoding='utf-8-sig')

In [49]:
# # DELETING DUPLICATE TWEETS

# df = pd.read_csv(filename + " cleaned.csv", encoding='utf-8-sig')
# df = df.drop_duplicates(subset=['tweet'], inplace=False)
# sizeafter = df.shape
# print('size before =', sizeinitial)
# print('size after =', sizeafter)
# df.to_csv(filename + " cleaned.csv", index = False, encoding='utf-8-sig')

In [50]:
# REMOVING STOPWORDS #2

df = pd.read_csv(filename + " cleaned.csv", encoding='utf-8-sig')
# Nothing in this cell reads these two names; they are kept only in case
# other cells still refer to them.
pos_tweets = df['full text']
test = pd.DataFrame(pos_tweets)

# Set lookup is constant-time per token, unlike scanning the stop word list.
stop_word_set = set(stop_words)

# Second pass: tokens that were glued to punctuation or digits during the
# first pass are now bare words and can be matched. A row emptied by the
# digit-removal step is read back from the CSV as NaN, which has no .split();
# such rows are passed through unchanged so the next whitespace pass drops
# them instead of this cell raising AttributeError.
df['full text'] = df['full text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_word_set)
    if isinstance(x, str) else x
)
print(df['full text'])
df.to_csv(filename + " cleaned.csv", index = False, encoding='utf-8-sig')

0      connected automated vehicles new technology wh...
1      examples mdot efforts field involve infrastruc...
2      implementation test facilities planet planet r...
3      new technology construction mdot construction ...
4      emergency response coordinate information plan...
                             ...                        
706    carmasm cooperative driving automation carma f...
707    virtual open innovation collaborative environm...
708                                                 next
709    next automated driving systems potential signi...
710                                              january
Name: full text, Length: 711, dtype: object


In [51]:
# DELETING WHITESPACE #3

# Final whitespace clean-up after the second stop word pass: newlines become
# spaces in every column, whitespace runs in the text column collapse to one
# space, and rows without text are dropped before the file is written back.
df = (
    pd.read_csv(filename + " cleaned.csv", encoding='utf-8-sig', skipinitialspace=True)
    .replace(r'\n', ' ', regex=True)
)
df['full text'] = df['full text'].replace(r'\s+', ' ', regex=True)
df = df.dropna(subset=['full text'])
df.to_csv(filename + " cleaned.csv", encoding='utf-8-sig', index=False)

In [52]:
# # ONLY TWEETS

# df = pd.read_csv(filename + " cleaned.csv", encoding='utf-8-sig')
# df = df.drop(df.columns[[0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11]], axis=1, inplace = False)
# df.to_csv(filename + " only tweets cleaned.csv", index = False, encoding='utf-8-sig')