In [71]:
# Rishabh Jain
# 102083054
# 3CO28
import os

# import pandas as pd
import cudf as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

import re
import string

from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split

In [72]:
# Sequence length shared by the whole pipeline: it is passed as `maxlen` to
# pad_sequences and as `input_length` to the Embedding layer.
max_len = 202

In [73]:
# `pd` is cudf here (see the imports), so df_train is a cudf DataFrame.
# pd_train is its pandas copy, used by the plotting and Keras preprocessing cells.
df_train = pd.read_csv("../input/nlp-getting-started/train.csv")
pd_train = df_train.to_pandas()

In [74]:
# Preview the first rows of the training frame.
df_train.head()

In [75]:
# Print the summary produced by DataFrame.info() for the training frame.
df_train.info()

In [76]:
# number of missing values per column in the training data
df_train.isna().sum()

In [77]:
# Count plot of the two target classes (true vs. false tweets).
# `x` is passed by keyword: recent seaborn releases accept only `data`
# positionally, so `catplot("target", data=...)` fails there with
# "got multiple values for argument 'data'".
sns.catplot(x="target", data=pd_train, kind="count", height=8)
plt.title("Distribution of Target Counts", size=20, weight="bold")
plt.xlabel("Target Labels", size=14, weight="bold")
plt.ylabel("Counts", size=14, weight="bold")
plt.show()

In [78]:
# Share of missing entries, in percent, for every column except the label
missing_values = {
    f"{col}_missing_percent": round(100 * pd_train[col].isnull().sum() / len(pd_train), 2)
    for col in pd_train.columns
    if col != "target"
}
# One-row frame -> long format with one (column name, percentage) pair per row
missing_values_df = (
    pd.DataFrame(missing_values, index=[0])
    .to_pandas()
    .melt(var_name="columns", value_name="percentage")
)

plt.figure(figsize=(10, 8))
seaborn_plot = sns.barplot(x="columns", y="percentage", data=missing_values_df)
# Label every bar with its own height, placed slightly above the bar top
for bar in seaborn_plot.patches:
    bar_height = bar.get_height()
    seaborn_plot.annotate(
        f"{bar_height:.2f}",
        (bar.get_x() + bar.get_width() / 2., bar_height),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )
plt.title("Percentage of Missing Values in Columns", size=20, weight="bold")
plt.xlabel("Columns", size=14, weight="bold")
plt.ylabel("Percentage", size=14, weight="bold")
plt.show()

In [79]:
# Number of distinct values in every column except the label
unique_values = {
    f"{col}_unique_values": pd_train[col].nunique()
    for col in pd_train.columns
    if col != "target"
}
# One-row frame -> long format with one (column name, count) pair per row
unique_values_df = (
    pd.DataFrame(unique_values, index=[0])
    .to_pandas()
    .melt(var_name="columns", value_name="counts")
)


plt.figure(figsize=(10, 8))
sns.set(style="whitegrid", color_codes=True)
seaborn_plot = sns.barplot(x="columns", y="counts", data=unique_values_df)
# Label every bar with its own height, placed slightly above the bar top
for bar in seaborn_plot.patches:
    bar_height = bar.get_height()
    seaborn_plot.annotate(
        f"{bar_height:.2f}",
        (bar.get_x() + bar.get_width() / 2., bar_height),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )
plt.title("Number of Unique Values in Columns", size=20, weight="bold")
plt.xlabel("Columns", size=14, weight="bold")
plt.ylabel("Counts", size=14, weight="bold")
plt.show()

In [80]:
# Tweets whose text appears more than once, with the list of labels given to
# each copy.
# keep=False marks *every* occurrence of a repeated text. The default
# (keep="first") drops the first copy, so its label would never be compared
# against the others and conflicts involving it would be missed.
duplicated_mask = pd_train["text"].duplicated(keep=False)
train_duplicates = pd_train[duplicated_mask].groupby(["text"])[["target"]].agg(list)

# Positions of the groups that carry more than one distinct label. Each group
# is listed once, however many of its copies disagree.
diff_label_idx = [
    idx
    for idx, labels in enumerate(train_duplicates["target"])
    if len(set(labels)) > 1
]

print(f"Number of Duplicated Tweets with different target labels in train: {len(diff_label_idx)}")
train_duplicates.iloc[diff_label_idx]

In [81]:
# Join every non-missing entry of the keyword column into one space-separated string
keywords = " ".join(pd_train["keyword"].dropna())

# Configure the cloud first, then render it from the joined keywords
word_cloud = WordCloud(width=800, height=500, max_font_size=112, random_state=24)
word_cloud = word_cloud.generate(keywords)

plt.figure(figsize=(10, 8))
plt.imshow(word_cloud, interpolation="bilinear")
plt.axis("off")
plt.title("keywords", size=20, weight="bold")
plt.show()

In [82]:
# Clean tweets by converting all text to lowercase and removing hyperlinks,
# hashtag symbols, punctuation and stopwords.
_PUNCTUATION = frozenset(string.punctuation)
# A link ends at the first whitespace; matching `.*` instead would also delete
# every word that follows the link on the same line.
_URL_PATTERN = re.compile(r'https?://\S+')
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE["english"]


def clean_tweet(tweet):
    """Normalise one tweet for tokenisation.

    Steps, in order: lowercase, drop http/https links, strip punctuation
    (which includes the '#' of hashtags, so the hashtag word itself is kept),
    and remove English stop words.

    Args:
        tweet: The raw tweet. It is coerced with ``str()`` before any string
            method is used, so a non-string value does not raise.

    Returns:
        The cleaned tweet as a single string of space-separated words.
    """
    # Coerce first, then lowercase: calling .lower() on the raw value would
    # fail for anything that is not already a str.
    text = str(tweet).lower()
    # Remove only the link itself, keeping the text after it.
    text = _URL_PATTERN.sub('', text)
    # '#' is part of string.punctuation, so hashtags lose their marker here.
    text = "".join(ch for ch in text if ch not in _PUNCTUATION)
    stop_words = _english_stop_words()
    return " ".join(word for word in text.split() if word not in stop_words)

In [83]:
# Shuffle the cudf frame; reset_index gives the shuffled rows a fresh 0..n-1 index.
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
# Cleaned text for the pandas frame that the tokenizer and model cells read.
pd_train["cleaned_text"] = pd_train["text"].apply(clean_tweet)
# Column "c" must be derived from df_train's own (shuffled) text. Taking it
# from the unshuffled pd_train would pair each row with another tweet's
# cleaned text, because both frames share the same 0..n-1 index labels.
df_train['c'] = df_train["text"].to_pandas().apply(clean_tweet)

In [84]:
# Build the vocabulary from the cleaned training tweets. "<unk>" is configured
# as the tokenizer's out-of-vocabulary token.
tokenizer = Tokenizer(oov_token="<unk>")
tokenizer.fit_on_texts(pd_train["cleaned_text"])

In [85]:
# Size of the Embedding layer's input dimension: one more than the number of
# entries in the tokenizer's word index (Keras word indices start at 1).
vocab_size=len(tokenizer.word_index)+1

In [86]:
# Turn each cleaned training tweet into a sequence of word indices and bring
# all sequences to length max_len; y holds the matching target labels.
X_padded = pad_sequences(tokenizer.texts_to_sequences(pd_train["cleaned_text"].values), maxlen= max_len)
y=pd_train["target"]

In [87]:
print("running")
# Stack the network layer by layer: embedding -> two bidirectional LSTMs ->
# dropout -> 2-way softmax (one output per target class).
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 400, input_length=max_len))
# The first recurrent layer returns the full sequence so the second one can consume it.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50)))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(2, activation='softmax'))

# Integer class labels + softmax output -> sparse categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

print("compiled")

In [88]:
print("training")
# Hold out 30% of the padded training data; it serves as the validation set
# for early stopping and is scored again in the evaluation cell.
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.3, random_state=42)

# Stop once validation accuracy has not improved for 3 epochs.
# restore_best_weights=True makes the callback put back the weights of the
# best validation epoch when it stops training; without it the model keeps
# the weights of the last epoch, i.e. `patience` epochs past the best one.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)

model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    callbacks=[early_stopping],
)
print("finished")

In [89]:
# Class probabilities for the held-out split, one row per sample and one
# column per class (the model ends in a 2-unit softmax).
y_proba = model.predict(X_test)
# Predicted label = index of the most probable class.
y_pred = y_proba.argmax(axis=1)
print("f1 score : ", f1_score(y_test, y_pred))
print("accuracy score: ", accuracy_score(y_test, y_pred))


In [90]:
# Load the competition test set and clean it exactly like the training text.
df_test = pd.read_csv("../input/nlp-getting-started/test.csv")
pd_test = df_test.to_pandas()
pd_test.loc[:, "cleaned_text"] = pd_test["text"].map(clean_tweet)

# Dedicated names for the test features and predictions: reusing X_padded and
# y_pred here would overwrite the training matrix and the validation
# predictions, so re-running the split or metrics cells would use mismatched data.
X_submission = pad_sequences(
    tokenizer.texts_to_sequences(pd_test["cleaned_text"].values),
    maxlen=max_len,
)
submission_pred = model.predict(X_submission).argmax(axis=1)

sub = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
# Predictions are assigned by position, so the row counts have to agree.
assert len(sub) == len(submission_pred), "prediction count does not match the submission template"
sub["target"] = submission_pred
sub.to_csv("submission.csv", index=False)
print("sub")