In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suki9\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
# Import data
data = pd.read_csv('Dataset1_labeled_data.csv')

In [4]:
# hate speech: 0
# offensive language: 1
# neither: 2

data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [23]:
# Size of dataset

data.shape

(24783, 8)

In [21]:
# Make a new column to highlight retweets
data['is_retweet'] = data['tweet'].apply(lambda x: x[:2]=='RT')
data['is_retweet'].sum()  # number of retweets

6484

In [28]:
# number of unique retweets
data.loc[data['is_retweet']].tweet.unique().size

6484

### Extracting substrings (@, #)

In [30]:
# Who is being retweeted?
# Who is being tweeted at/mentioned?
# What hashtags are being used?

# Create functions

def find_retweeted(tweet):
    '''This function will extract the twitter handles of retweed people'''
    return re.findall('(?<=RT\s)(@[A-Za-z]+[A-Za-z0-9-_]+)', tweet)

def find_mentioned(tweet):
    '''This function will extract the twitter handles of people mentioned in the tweet'''
    return re.findall('(?<!RT\s)(@[A-Za-z]+[A-Za-z0-9-_]+)', tweet)  

def find_hashtags(tweet):
    '''This function will extract hashtags'''
    return re.findall('(#[A-Za-z]+[A-Za-z0-9-_]+)', tweet)   

In [33]:
# Make new columns for retweeted usernames, mentioned usernames and hashtags

data['retweeted'] = data.tweet.apply(find_retweeted)
data['mentioned'] = data.tweet.apply(find_mentioned)
data['hashtags'] = data.tweet.apply(find_hashtags)

In [34]:
# Have a look at the new columns

data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,is_retweet,retweeted,mentioned,hashtags
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,False,[@mayasolovely],[],[]
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,False,[@mleew17],[],[]
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,False,[@UrKindOfBrand],[],[]
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,False,[@C_G_Anderson],[@viva_based],[]
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,False,[@ShenikaRoberts],[],[]


#### What hashtags are being used 

In [37]:
# Take the rows from the hashtag columns where there are actually hashtags

hashtags_list_data = data.loc[
                       data.hashtags.apply(
                           lambda hashtags_list: hashtags_list !=[]
                       ),['hashtags']]
hashtags_list_data.head()

Unnamed: 0,hashtags
37,[#Shots]
41,[#SevenOne]
45,[#HappyHumpDay]
72,[#ahmesehwetness]
73,"[#Eaglesnation, #Eagles]"


In [39]:
# Create dataframe where each use of hashtag gets its own row

flattened_hashtags_data = pd.DataFrame(
    [hashtag for hashtags_list in hashtags_list_data.hashtags
    for hashtag in hashtags_list],
    columns=['hashtag'])
flattened_hashtags_data.head()

Unnamed: 0,hashtag
0,#Shots
1,#SevenOne
2,#HappyHumpDay
3,#ahmesehwetness
4,#Eaglesnation


In [42]:
flattened_hashtags_data.shape

(3490, 1)

In [40]:
# Number of unique hashtags

flattened_hashtags_data['hashtag'].unique().size

2251

In [51]:
# Count of appearances of each hashtag
popular_hashtags = flattened_hashtags_data.groupby('hashtag').size()\
                                        .reset_index(name='counts')\
                                        .sort_values('counts', ascending=False)\
                                        .reset_index(drop=True)
popular_hashtags.head(10)

Unnamed: 0,hashtag,counts
0,#Yankees,94
1,#iphone,50
2,#ipad,48
3,#sex,48
4,#android,48
5,#porn,48
6,#xxx,48
7,#tcot,41
8,#hoosiers,23
9,#morningjoe,22


#### Who is being retweeted 

In [52]:
# Take the rows from the retweeted columns where there are actually hashtags

retweeted_list_data = data.loc[
                       data.retweeted.apply(
                           lambda retweeted_list: retweeted_list !=[]
                       ),['retweeted']]
retweeted_list_data.head()


Unnamed: 0,retweeted
0,[@mayasolovely]
1,[@mleew17]
2,[@UrKindOfBrand]
3,[@C_G_Anderson]
4,[@ShenikaRoberts]


In [55]:
# Create dataframe where each use of retweeted gets its own row

flattened_retweeted_data = pd.DataFrame(
    [retweeted for retweeted_list in retweeted_list_data.retweeted
    for retweeted in retweeted_list],
    columns=['retweeted'])
flattened_retweeted_data.head()


Unnamed: 0,retweeted
0,@mayasolovely
1,@mleew17
2,@UrKindOfBrand
3,@C_G_Anderson
4,@ShenikaRoberts


In [56]:
flattened_retweeted_data.shape

(7073, 1)

In [57]:
# Number of unique retweeted

flattened_retweeted_data['retweeted'].unique().size

5296

In [58]:
# Count of appearances of each retweeted
popular_retweeted = flattened_retweeted_data.groupby('retweeted').size()\
                                        .reset_index(name='counts')\
                                        .sort_values('counts', ascending=False)\
                                        .reset_index(drop=True)
popular_retweeted.head(10)

Unnamed: 0,retweeted,counts
0,@iDntWearCondoms,25
1,@Yankees,24
2,@JoeBudden,23
3,@CauseWereGuys,22
4,@KINGTUNCHI_,19
5,@FriendlyAssh0le,19
6,@KingHorseDick,19
7,@FunnyPicsDepot,15
8,@CuhCuhCuh,15
9,@SteveStfler,14


#### Who is being tweeted at/mentioned

In [60]:
# Take the rows from the mentioned columns where there are actually hashtags

mentioned_list_data = data.loc[
                       data.mentioned.apply(
                           lambda mentioned_list: mentioned_list !=[]
                       ),['mentioned']]
mentioned_list_data.head()



Unnamed: 0,mentioned
3,[@viva_based]
5,[@T_Madison_x]
7,[@selfiequeenbri]
9,[@rhythmixx_]
52,[@DaRealKha]


In [61]:
# Create dataframe where each use of mentioned gets its own row

flattened_mentioned_data = pd.DataFrame(
    [mentioned for mentioned_list in mentioned_list_data.mentioned
    for mentioned in mentioned_list],
    columns=['mentioned'])
flattened_mentioned_data.head()

Unnamed: 0,mentioned
0,@viva_based
1,@T_Madison_x
2,@selfiequeenbri
3,@rhythmixx_
4,@DaRealKha


In [62]:
flattened_mentioned_data.shape

(10958, 1)

In [63]:
# Number of unique retweeted

flattened_mentioned_data['mentioned'].unique().size

7039

In [64]:
# Count of appearances of each retweeted
popular_mentioned = flattened_mentioned_data.groupby('mentioned').size()\
                                        .reset_index(name='counts')\
                                        .sort_values('counts', ascending=False)\
                                        .reset_index(drop=True)
popular_mentioned.head(10)

Unnamed: 0,mentioned,counts
0,@kieffer_jason,101
1,@Yankees,46
2,@Huntermoore,30
3,@Buckm00se,23
4,@sbsylvester,22
5,@what_evaittakes,20
6,@VoiceOfDStreetz,19
7,@JawShoeeAhhh,18
8,@Leelucas_,18
9,@viaNAWF,18


### Cleaning Unstructured Text Data

In [19]:
# Create functions

def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information'''
    tweet = re.sub('(RT\s@[A-Za-z0-9-_]+[A-Za-z0-9-_]+)', '', tweet) # remove retweet
    tweet = re.sub('(@[A-Za-z0-9-_]+[A-Za-z0-9-_]+)', '', tweet) # remove tweeted at
    return tweet

In [22]:
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@#'

# cleaning master function
def clean_tweet(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower() # lower case
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub('\s+', ' ', tweet) #remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet) # remove numbers
    tweet = re.sub('amp', '', tweet) # remove amp
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords] # remove stopwords

    tweet_token_list = [word_rooter(word) if '#' not in word else word
                        for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
                                            for i in range(len(tweet_token_list)-1)]
    tweet = ' '.join(tweet_token_list)
    return tweet

In [25]:
data['clean_tweet'] = data.tweet.apply(clean_tweet)
data.head(30)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,clean_tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,woman complain clean hous man alway take trash
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,boy dat cold tyga dwn bad cuffin dat hoe st p...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,dawg ever fuck bitch start cri confus shit
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,look like tranni
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,shit hear might true might faker bitch told ya
5,5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just...",shit blow claim faith somebodi still fuck hoe...
6,6,3,0,3,0,1,"!!!!!!""@__BrighterDays: I can not just sit up ...",sit hate anoth bitch got much shit go
7,7,3,0,3,0,1,!!!!&#8220;@selfiequeenbri: cause I'm tired of...,caus tire big bitch come us skinni girl
8,8,3,0,3,0,1,""" &amp; you might not get ya bitch back &amp; ...",might get ya bitch back that
9,9,3,1,2,0,1,""" @rhythmixx_ :hobbies include: fighting Maria...",hobbi includ fight mariam bitch


In [27]:
# Split our data into a training set and a testing set

train, test = train_test_split(data, test_size=0.3, random_state=42)

### Logistic regression model

#### Basic model

In [28]:
# The vectorizer object will be used to transform text to vector form
basicvectorizer = CountVectorizer()

# Apply transformation
basictrain = basicvectorizer.fit_transform(train['clean_tweet'])

print(basictrain.shape)  # 13613 different words


(17348, 12937)


In [29]:
#  Train a logistic regression model

basicmodel = LogisticRegression()
basicmodel = basicmodel.fit(basictrain, train["class"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
#  Set up the test data

basictest = basicvectorizer.transform(test['clean_tweet'])
predictions = basicmodel.predict(basictest)

In [31]:
# Results

pd.crosstab(test["class"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1,2
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,100,282,45
1,95,5479,173
2,14,130,1117
