In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import wordnet

In [3]:
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv('training.1600000.processed.noemoticon.csv',encoding = "ISO-8859-1",names=DATASET_COLUMNS)

In [4]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
df=df.drop(columns=['ids', 'date','user','flag'],axis=1)

In [6]:
df.shape

(1600000, 2)

In [7]:
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [8]:

url = '((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'
# Convert to Lower
def to_lower(tweet):
    tweets = [word.lower() for word in tweet]
    return tweets

#####################################################################################

# drop stop words
def drop_stop_words(tweets):
    stop_words = set(stopwords.words('english'))
    punctuation = list(string.punctuation)
    stop_words.update(punctuation)
    tweets     = [' '.join(word for word in word_tokenize(tweet) if word not in stop_words) for tweet in tweets]
    return tweets

# Drop Qoute
def Drop_Qoute(tweets):
#     tweets = [re.sub('^&(\w)+;','',tweet) for tweet in tweets]
    tweets = [' '.join([re.sub('^&(\w)+;',' && ',tw) for tw in tweet.split()]) for tweet in tweets] 

    return tweets

# Drop Reetweet
def drop_retweet(tweets):
    # drop retweet name
    tweets = [' '.join([re.sub('^@(\w)+',' ',tw) for tw in tweet.split()]) for tweet in tweets] 
    
    return tweets

def drop_links(tweets):
    # Drop URL from text
    tweets = [' '.join([re.sub(url,' ',tw) for tw in tweet.split()]) for tweet in tweets] 
#     tweets = [' '.join([re.sub('^http?:\/\/.*[\r\n]*',' ',tw) for tw in tweet.split()]) for tweet in tweets] 

    return tweets

def clean_spaces(tweets):
    tweets = [" ".join(tweet.split()) for tweet in tweets]
    return tweets


def stem_tweets(tweets):
    pr  =PorterStemmer() 
    tweets = [' '.join(pr.stem(word) for word in tweet.split()   ) for tweet in tweets]
    return tweets

################################################################################

def lemtize_tweets(tweets):
    lemmatizer = WordNetLemmatizer()
    
    tweets = [' '.join(lemmatizer.lemmatize(word) for word in tweet.split()   ) for tweet in tweets]

    return tweets


In [9]:
def drop_all(text):
    # Convert Text To Lower 
    text = to_lower(text)

    # Drop Retweets User Name
    text = drop_retweet(text)

    # Drop Links From Tweets
    text = drop_links(text)

    # Drop HTML tag From Tweets
    text = Drop_Qoute(text)

    # Drop Stop Words 
    text = drop_stop_words(text)
    
    # lematize Words
    text = lemtize_tweets(text)
    # Clean MUlit Spaces
    text = clean_spaces(text)
    
    return text
    

In [10]:
def Test_on_sample(sample_size,pipeline,model):
    # Take Sample
    sample = df.sample(sample_size).copy()
    
    x = sample['text'].copy()
    y = sample['target'].copy()
    
    x_train , x_test , y_train , y_test = train_test_split\
    (x,y,test_size=0.2,random_state=42)
    my_pipe = Pipeline(pipeline)
    my_pipe.fit(x_train,y_train)
    y_hat = my_pipe.predict(x_test)
    print(classification_report(y_test,y_hat))

    
    del  x_train , x_test , y_train , y_test ,x,y ,y_hat

In [11]:
from sklearn.preprocessing import FunctionTransformer
to_lower_pipe = FunctionTransformer(to_lower)

drop_retweet_pipe = FunctionTransformer(drop_retweet)

drop_links_pipe = FunctionTransformer(drop_links)

Drop_Qoute_pipe = FunctionTransformer(Drop_Qoute)

drop_stop_words_pipe = FunctionTransformer(drop_stop_words)


clean_spaces_pipe = FunctionTransformer(clean_spaces)

stem_tweets_pipe = FunctionTransformer(stem_tweets)

lemtize_tweets_pipe = FunctionTransformer(lemtize_tweets)

In [12]:
#  create model
from sklearn.pipeline import Pipeline
import re 
import nltk
#nltk.download('wordnet')

lr = LogisticRegression()
# list of function for pipeline
lr_pipe = [
    ('lower',to_lower_pipe),
    ('retweet',drop_retweet_pipe),
    ('links',drop_links_pipe),
    ('lematization',lemtize_tweets_pipe),
    ('spaces',clean_spaces_pipe),
    ('counts',TfidfVectorizer(use_idf=True,ngram_range=(1,2))),
    ('model',lr)          
]
# Train and Test Evaluate
Test_on_sample(1500,lr_pipe,lr)

              precision    recall  f1-score   support

           0       0.70      0.75      0.73       154
           4       0.72      0.66      0.69       146

    accuracy                           0.71       300
   macro avg       0.71      0.71      0.71       300
weighted avg       0.71      0.71      0.71       300



In [15]:
x_train.shape

NameError: name 'x_train' is not defined

In [None]:
classifier= KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2 ) 

Test_on_sample(1500,lr_pipe,classifier)



In [None]:
from keras.models import Sequential
import tensorflow as tf

In [None]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(50000, 16, input_length=35),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)),
    tf.keras.layers.Dense(6, activation='softmax')
])
Test_on_sample(1500,lr_pipe,model)