In [None]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
from wordcloud import WordCloud
import nltk

from nltk.stem.porter import *
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split as tts

from sklearn.model_selection import StratifiedKFold as SKF

import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

numpy.random.seed(7)

import tweepy
import twitter_credentials as tc

In [None]:
# Load the labelled training tweets; the 'id' column is discarded.
data = pd.read_csv('train_E6oV3lV.csv').drop(['id'], axis = 1)
# drop_duplicates returns a new frame, so its result has to be assigned.
# The index is rebuilt so it stays 0..n-1 for the positional lookups done later.
data = data.drop_duplicates(subset = ['tweet']).reset_index(drop = True)

data.shape

In [None]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to delete.

    Returns
    -------
    str
        The text with all matches replaced by the empty string.
    """
    # Substitute with the pattern itself. Re-using each matched *text* as a
    # new regex misfires on metacharacters such as "(" or "[", and lets a short
    # match (e.g. "@a") eat the prefix of a longer one (e.g. "@ab").
    return re.sub(pattern, '', input_txt)
    
def textCleaner(text):
    """Normalise the tweets in ``text['tweet']`` and return the cleaned Series.

    Steps, in order: strip ``@user`` mentions, replace every character that is
    not a letter or ``#`` with a space, drop words of three characters or
    fewer, drop English stop words, stem the remaining words and join them
    back into one space-separated string per tweet.

    Parameters
    ----------
    text : pandas.DataFrame
        Frame with a ``'tweet'`` column of strings.

    Returns
    -------
    pandas.Series
        The cleaned tweets. As a side effect the same Series is stored in
        ``text['clean']``.
    """
    stopw = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def _filter_and_stem(tweet):
        # Length filter first, then stop-word filter, then stemming.
        long_words = [w for w in tweet.split() if len(w) > 3]
        return ' '.join(stemmer.stem(w) for w in long_words if w not in stopw)

    # Series.apply keeps the frame's own index and copes with an empty frame.
    cleaned = text['tweet'].apply(lambda tweet: remove_pattern(tweet, r"@[\w]*"))  # removing users
    # regex=True is spelled out: newer pandas treats the pattern literally by default.
    cleaned = cleaned.str.replace(r"[^a-zA-Z#]", " ", regex=True)  # only words and hashtags

    text['clean'] = cleaned.apply(_filter_and_stem)

    return text['clean']

In [None]:
# Clean every training tweet; textCleaner also stores the result in data['clean'].
clean = textCleaner(data)

In [None]:
# Fit TF-IDF on the cleaned training tweets; the fitted vectorizer is reused
# further down to transform the freshly collected tweets.
tfidf_vectorizer = TfidfVectorizer()
# .toarray() turns the result into a dense array.
# NOTE(review): these are TF-IDF weights, yet the training cell passes them
# through pad_sequences and an Embedding layer, which presumably expect integer
# token ids — confirm the features still carry information after that step.
X = tfidf_vectorizer.fit_transform(clean).toarray()
# Target labels taken from the 'label' column.
Y = data['label']

In [None]:
# Model hyper-parameters.
max_review_length = 500      # maxlen for pad_sequences and input_length of the Embedding layer
top_words = 5000             # first argument of the Embedding layer
embeddingVectorLength = 32   # second argument of the Embedding layer

In [None]:
# Stratified 2-fold cross-validation: train a fresh CNN+LSTM model per fold,
# record its accuracy and keep the best-scoring model for later predictions.
skf = SKF(n_splits = 2, shuffle = True, random_state = 42)

skf_percentages = []    # accuracy of each fold, in percent
skf_best_model = None   # model of the best fold so far
best_percentage = 0

for train_index, test_index in skf.split(X, Y):
    xtrain, xtest = X[train_index], X[test_index]
    # split() yields positional indices, so the labels are selected by
    # position (.iloc) rather than by index label.
    ytrain, ytest = Y.iloc[train_index], Y.iloc[test_index]

    # NOTE(review): X holds TF-IDF weights; confirm that padding/truncating
    # them to max_review_length is what the Embedding layer should receive.
    xtrain = sequence.pad_sequences(xtrain, maxlen = max_review_length)
    xtest = sequence.pad_sequences(xtest, maxlen = max_review_length)

    model = Sequential()
    model.add(Embedding(top_words, embeddingVectorLength, input_length = max_review_length))
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(100))
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    # summary() prints by itself; wrapping it in print() only adds a stray "None".
    model.summary()

    # NOTE(review): the held-out fold doubles as validation data and as the
    # basis for picking the best model, so the reported score is optimistic.
    model.fit(xtrain, ytrain, validation_data = (xtest, ytest), epochs = 5, batch_size = 64)

    # evaluate() returns [loss, accuracy] for the metrics compiled above.
    skf_percentages.append(model.evaluate(xtest, ytest, verbose = 0)[1]*100)

    if skf_percentages[-1] > best_percentage:
        best_percentage = skf_percentages[-1]
        skf_best_model = model

In [None]:
# Mean accuracy (in percent) across the cross-validation folds.
print(numpy.mean(skf_percentages))

In [None]:
# Build an authenticated tweepy client; the keys are read from the local
# twitter_credentials module (imported as tc) rather than written here.
auth = tweepy.OAuthHandler(tc.consumerKey, tc.consumerSecret)
auth.set_access_token(tc.accessToken, tc.accessTokenSecret)
api = tweepy.API(auth,wait_on_rate_limit=True)

In [None]:
def extract_coordinates(row):
    """Return the 'coordinates' entry of ``row['Tweet Coordinates']``.

    Gives ``None`` when that field is empty or otherwise falsy.
    """
    geo = row['Tweet Coordinates']
    return geo['coordinates'] if geo else None

def extract_place(row):
    """Return the ``full_name`` attribute of ``row['Place Info']``.

    Gives ``None`` when that field is empty or otherwise falsy.
    """
    place = row['Place Info']
    if not place:
        return None
    return place.full_name


In [None]:
username = 'SonuSood'
max_tweets = 1

# Page through the account's timeline, stopping after max_tweets items.
# The handle is passed as `screen_name`, the explicit keyword for a user name;
# the ambiguous `id` keyword is not accepted by every tweepy release.
tweets = tweepy.Cursor(api.user_timeline, screen_name=username, tweet_mode='extended').items(max_tweets)

# One single-element row per fetched tweet, holding its full_text.
tweets_list = [[tweet.full_text] for tweet in tweets]
tweet1 = ["I almost had a heart attack as I saw this black guy outside my window. He a fucking gorilla or wot !? #monsterNigga"]
tweet2 = ["Its really awful to see how black these people get working in the coal mines #improveConditions #MineWorkersMatters"]
tweet3 = ["I really hate how these hoes make their way through the police #fakeGenderEquality #fakeFeminism "]

# Count the timeline tweets before the hand-written examples are appended, so
# the label list below always matches the number of rows.
n_fetched = len(tweets_list)

tweets_list.extend([tweet1, tweet2, tweet3])

# Each entry is a one-element list, i.e. one row of the single 'tweet' column.
tweets_df = pd.DataFrame(tweets_list, columns = ['tweet'])
# Fetched timeline tweets are labelled 0; the three hand-written examples are
# labelled 1, 0, 1 in order.
tweets_df['Actual Label'] = [0] * n_fetched + [1, 0, 1]

In [None]:
# Strip URLs (tokens starting with "http" or "www") from the raw tweets.
tweets_df['tweet'] = tweets_df['tweet'].replace(r'http\S+', '', regex = True).replace(r'www\S+', '', regex=True)
# Apply the same cleaning as for the training data; this also adds a 'clean'
# column to tweets_df as a side effect.
cleaned_tweets = textCleaner(tweets_df)

In [None]:
# Reuse the vectorizer fitted on the training tweets (transform only, no
# refit), then pad/truncate to max_review_length as in training.
# NOTE(review): as in training, these are TF-IDF weights passed to
# pad_sequences — confirm that is the intended model input.
XTest = tfidf_vectorizer.transform(cleaned_tweets).toarray()
XTest = sequence.pad_sequences(XTest, maxlen = max_review_length)

In [None]:
# Sequential.predict_classes is not available in current Keras releases.
# For a single sigmoid output the equivalent is thresholding predict() at 0.5,
# which keeps the (n_samples, 1) int32 shape.
results = (skf_best_model.predict(XTest, batch_size = 64, verbose = 0) > 0.5).astype('int32')

In [None]:
# Store the predictions exactly as the model produced them. Hand-editing
# individual entries would make the comparison with 'Actual Label' meaningless.
tweets_df['Predicted Label'] = np.asarray(results).ravel()
# Non-inplace drop with errors='ignore' keeps this cell re-runnable once the
# helper 'clean' column is already gone.
tweets_df = tweets_df.drop(columns = ['clean'], errors = 'ignore')
tweets_df

In [None]:
# Display the tweets with their 'Actual Label' and 'Predicted Label' columns.
tweets_df