<a href="https://colab.research.google.com/github/horsinnaround/Works/blob/main/work%2040%20disaster-tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import numpy as np
from PIL import Image
from wordcloud import WordCloud,STOPWORDS
from collections import Counter
from itertools import chain
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForSequenceClassification
from transformers import XLNetTokenizer, TFXLNetModel
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import roc_curve, auc, precision_recall_curve,average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score, matthews_corrcoef, log_loss
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau
from tensorflow.keras.utils import plot_model
%matplotlib inline

# Read Data

In [None]:
# Load the training tweets from the competition CSV.
train_csv_path = "/kaggle/input/nlp-getting-started/train.csv"
df = pd.read_csv(train_csv_path)

# Load the PNG as a NumPy array; it is passed to WordCloud as the shape mask below.
mask_image = Image.open("/kaggle/input/wodcloud-twiter-pic/twitter.png")
wordcloud_mask = np.array(mask_image)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
# Keep only the tweet text and the label; the other columns are not used below.
df = df.drop(columns=["id", "keyword", "location"])

In [None]:
df.head()

### You are predicting whether a given tweet is about a real disaster or not. If so, predict a 1. If not, predict a 0.

In [None]:
df.target.value_counts()

In [None]:
# Horizontal bar chart of how many tweets carry each target value.
plt.figure(figsize=(7, 5))
sns.countplot(
    y="target",
    data=df,
    palette=["navy", "crimson"],
)
plt.title("Compare Target")
plt.show()

# Count Character

In [None]:
# Number of characters in each tweet.
df["character_count"]=df["text"].str.len()

# Count Word

In [None]:
# Number of whitespace-separated tokens in each tweet.
df['word_count'] = df['text'].str.split().str.len()


# Count Sentence

In [None]:
# Number of sentences per tweet, as segmented by NLTK's sentence tokenizer.
df["sentence_count"] = [len(nltk.sent_tokenize(tweet)) for tweet in df["text"]]


In [None]:
# Pairwise correlation between the label and the three length features.
feature_columns = ["target", "character_count", "word_count", "sentence_count"]
cor = df[feature_columns].corr()

plt.figure(figsize=(7, 5))
sns.heatmap(
    cor,
    cmap="jet",
    annot=True,
    linecolor="darkorange",
    linewidths=7,
)
plt.show()

# Data Length

In [None]:
# Compare the character-length distribution of the two classes.
text_lengths = df["text"].str.len()
notdisaster_len = text_lengths[df["target"] == 0]
disaster_len = text_lengths[df["target"] == 1]

# Use one set of 40 bins spanning the full length range for both classes.
# Letting each hist() call pick its own bins gives different bin widths per
# class, which makes the bar heights not directly comparable.
shared_bins = np.linspace(text_lengths.min(), text_lengths.max(), 41)

plt.figure(figsize=(7,5))
# alpha < 1 keeps the first histogram visible where the second one overlaps it.
plt.hist(notdisaster_len, bins=shared_bins, alpha=0.6, label='Not_Disaster', color="red")
plt.hist(disaster_len, bins=shared_bins, alpha=0.6, label='Disaster', color="green")
plt.title("Disaster Vs Non Disaster\n",fontsize=20,color="brown")
plt.legend()
plt.show()

# ALL Data Wordcloud

In [None]:
# Word cloud built from every tweet, drawn inside the mask image.
all_text = " ".join(df["text"])
wordcloud = WordCloud(
    width=800,
    height=800,
    stopwords=STOPWORDS,
    background_color="orange",
    max_words=800,
    colormap="ocean",
    mask=wordcloud_mask,
).generate(all_text)

plt.figure(figsize=(15, 15))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Disaster Data Wordcloud

In [None]:
# Word cloud restricted to tweets labelled 1 (disaster).
disaster_wordcloud = df[df["target"] == 1]
disaster_text = " ".join(disaster_wordcloud["text"])
wordcloud = WordCloud(
    width=800,
    height=800,
    stopwords=STOPWORDS,
    background_color="navy",
    max_words=800,
    colormap="hsv",
    mask=wordcloud_mask,
).generate(disaster_text)

plt.figure(figsize=(15, 15))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Non Disaster Wordcloud

In [None]:
# Word cloud restricted to tweets labelled 0 (not a disaster).
nondisaster_wordcloud = df[df["target"] == 0]
nondisaster_text = " ".join(nondisaster_wordcloud["text"])
wordcloud = WordCloud(
    width=800,
    height=800,
    stopwords=STOPWORDS,
    background_color="teal",
    max_words=800,
    colormap="CMRmap",
    mask=wordcloud_mask,
).generate(nondisaster_text)

plt.figure(figsize=(15, 15))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# 30 Most Common Words From All Text

In [None]:
# Tally whitespace-separated tokens over all tweets (no lower-casing or
# punctuation stripping is applied) and keep the 30 most frequent ones.
data_set = df["text"].str.split()
all_words = [word for tokens in data_set for word in tokens]
counter = Counter(all_words)
common_words = counter.most_common(30)
df_common_words = pd.DataFrame(common_words, columns=["Word", "Count"])

# One colour per bar.
colors = [
    "cyan", "lime", "magenta", "gold", "purple", "tomato",
    "teal", "sandybrown", "mediumseagreen", "royalblue", "darkorchid", "darkturquoise",
    "darkgoldenrod", "mediumvioletred", "mediumaquamarine", "lightcoral", "darkslategray", "olivedrab",
    "dodgerblue", "indianred", "limegreen", "steelblue", "darkviolet", "chocolate",
    "mediumslateblue", "darkgreen", "orangered", "mediumblue", "peru", "mediumspringgreen",
]

plt.figure(figsize=(12, 6))
sns.barplot(data=df_common_words, x="Count", y="Word", palette=colors)
plt.title("30 Most Common Words")
plt.xlabel("Count")
plt.ylabel("Word")
plt.show()


# 30 Most Common Words From Non Disaster Text

In [None]:
# 30 most frequent whitespace-separated tokens among tweets labelled 0.
no_disaster_text = df[df["target"] == 0]
data_set = no_disaster_text["text"].str.split()
all_words = list(chain.from_iterable(data_set))
counter = Counter(all_words)
common_words = counter.most_common(30)
df_common_words = pd.DataFrame(common_words, columns=["Word", "Count"])

plt.figure(figsize=(12, 8))
sns.barplot(data=df_common_words, x="Count", y="Word", palette="Set1")
plt.title("30 Most Common Words Non Disaster")
plt.xlabel("Count Non Disaster")
plt.ylabel("Non Disaster Word")
plt.show()



# 30 Most Common Words From Disaster Text

In [None]:
# 30 most frequent whitespace-separated tokens among tweets labelled 1.
disaster_text = df[df["target"] == 1]
data_set = disaster_text["text"].str.split()
all_words = list(chain.from_iterable(data_set))
counter = Counter(all_words)
common_words = counter.most_common(30)
df_common_words = pd.DataFrame(common_words, columns=["Word", "Count"])

# One colour per bar.
colors = [
    "red", "green", "blue", "yellow", "orange", "purple",
    "pink", "brown", "gray", "cyan", "magenta", "teal",
    "olive", "lime", "navy", "maroon", "aquamarine", "coral",
    "gold", "indigo", "lavender", "salmon", "silver", "tan",
    "violet", "crimson", "darkgreen", "darkblue", "darkorange", "deeppink",
]

plt.figure(figsize=(12, 8))
sns.barplot(data=df_common_words, x="Count", y="Word", palette=colors)
plt.title("30 Most Common Words Disaster")
plt.xlabel("Count  Disaster")
plt.ylabel("Disaster Word")
plt.show()


# Character Count

In [None]:
# Density histogram (with KDE) of characters per tweet, one series per class.
color = ["red", "green"]

plt.figure(figsize=(7, 5))
sns.histplot(
    data=df,
    x="character_count",
    hue="target",
    palette=color,
    stat="density",
    element="step",
    kde=True,
)
plt.title("Character Count", fontsize=15, color="darkblue")
plt.show()

# Word Count

In [None]:
# Density histogram (with KDE) of words per tweet, one series per class.
color = ["navy", "darkorange"]

plt.figure(figsize=(7, 5))
sns.histplot(
    data=df,
    x="word_count",
    hue="target",
    palette=color,
    stat="density",
    element="poly",
    kde=True,
)
plt.title("Word Count", fontsize=15, color="darkblue")
plt.show()

# Sentence Count

In [None]:
# Density histogram (with KDE) of sentences per tweet, one series per class.
color = ["purple", "black"]

plt.figure(figsize=(7, 5))
sns.histplot(
    data=df,
    x="sentence_count",
    hue="target",
    palette=color,
    stat="density",
    element="step",
    kde=True,
)
plt.title("Sentence Count", fontsize=15, color="darkblue")
plt.show()

In [None]:
# The length features were only needed for the plots above; remove them again.
df = df.drop(columns=["character_count", "word_count", "sentence_count"])

# Average Text Length (Characters)

In [None]:
# Mean number of characters per tweet.
avg_len = df["text"].map(len).mean()
print(f"Average Text Length is : {avg_len:.2f}")

# Maximum Length

In [None]:
# Fixed token-sequence length: passed to the tokenizer as max_length and used
# as the input shape of the Keras model.
max_len=100

In [None]:

# Model inputs are the raw tweet texts; targets are the labels as a NumPy array.
text_data = df["text"]
label_data = np.array(df["target"])

# XLNet Tokenizer and Model

In [None]:
# Load the pretrained 'xlnet-base-cased' tokenizer and the matching TensorFlow
# encoder (TFXLNetModel, i.e. without a classification head).
xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
xlnet_model = TFXLNetModel.from_pretrained('xlnet-base-cased')



# Encode Data

In [None]:
def xlnet_encode(final_text_data, max_len):
    """Tokenize texts with the notebook-level ``xlnet_tokenizer``.

    Every text is encoded with special tokens, truncated to at most
    ``max_len`` tokens and padded to exactly ``max_len`` tokens, so the
    per-text results stack into rectangular arrays.

    Args:
        final_text_data: Iterable of strings (list, NumPy array, pandas
            Series, ...). Items are consumed in iteration order.
        max_len: Length of every encoded sequence.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of NumPy arrays with one row
        per input text.
    """
    input_ids = []
    attention_masks = []

    # Iterate over the values instead of indexing with range(len(...)):
    # ``series[i]`` is a label lookup on a pandas Series, so it raises or
    # returns rows in the wrong order once the index is not 0..n-1.
    for text in final_text_data:
        encode_data = xlnet_tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding="max_length",  # replaces the deprecated pad_to_max_length=True
            truncation=True,       # make truncation to max_len explicit
            return_attention_mask=True,
        )
        input_ids.append(encode_data["input_ids"])
        attention_masks.append(encode_data["attention_mask"])

    return np.array(input_ids), np.array(attention_masks)


In [None]:
# Tokenize all training tweets into fixed-length id and attention-mask arrays.
text_input_ids,text_attention_masks = xlnet_encode(text_data,max_len)

In [None]:
# Sanity check: ids, masks and labels should have the same number of rows.
print(
    f"Text Input Ids Shape {text_input_ids.shape} \n"
    f"Text Input Attention Mask Shape {text_attention_masks.shape} \n"
    f"Label Data shape {label_data.shape}"
)

In [None]:
# Hold out 20 % of the rows. Ids, labels and masks are passed to one call so
# all three are split with the same shuffled row selection.
X_train, X_test, Y_train, Y_test, train_mask, test_mask = train_test_split(
    text_input_ids,
    label_data,
    text_attention_masks,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Report the shapes of every array produced by the split.
print(
    f"Train input shape {X_train.shape}\n"
    f"Test input shape {X_test.shape}\n"
    f"Train label shape {Y_train.shape}\n"
    f"Test label shape {Y_test.shape}\n"
    f"Train attention mask shape {train_mask.shape}\n"
    f"Test attention mask shape {test_mask.shape}"
)

# Create XLNET Model

In [None]:
def Create_XLNet_Model():
    """Assemble the binary tweet classifier on top of the pretrained XLNet encoder.

    Reads the notebook-level ``max_len`` and ``xlnet_model``. The first output
    of the encoder is max-pooled, fed through three tanh dense layers
    (256, 128 and 64 units), a dropout of 0.2, and a single sigmoid unit.

    Returns:
        An uncompiled Keras model taking ``[input_ids, attention_masks]``.
    """
    layers = tf.keras.layers

    token_ids_in = tf.keras.Input(shape=(max_len,), dtype="int32")
    mask_in = tf.keras.Input(shape=(max_len,), dtype="int32")

    # Element 0 of the encoder outputs is what gets pooled.
    encoder_out = xlnet_model(input_ids=token_ids_in, attention_mask=mask_in)[0]
    features = layers.GlobalMaxPooling1D()(encoder_out)

    # Dense stack, narrowing towards the output.
    for units in (256, 128, 64):
        features = layers.Dense(units, activation="tanh")(features)

    features = layers.Dropout(0.2)(features)
    probability = layers.Dense(1, activation="sigmoid")(features)

    return tf.keras.models.Model(inputs=[token_ids_in, mask_in], outputs=probability)





In [None]:
# Build the network, then configure it for binary classification.
model = Create_XLNet_Model()

learning_rate = 1e-5
loss = "binary_crossentropy"
metric = "accuracy"
optimizer = Adam(learning_rate=learning_rate)

model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
model.summary()

In [None]:
# Render the layer graph to a PNG file, then read that file back and show it
# in a large figure without axes.
plot_model(model, to_file='model_architecture.png', show_shapes=True)
plt.figure(figsize=(25, 15))
img = plt.imread('model_architecture.png')
plt.imshow(img)
plt.axis('off')
plt.show()


In [None]:

# Stop training when val_loss has not improved for 10 epochs and restore the
# weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)


In [None]:
# Fine-tune for up to 30 epochs with batches of 32, with early stopping.
# NOTE(review): the hold-out split (X_test / Y_test) is used here as validation
# data for early stopping and is also the set the metrics below are computed on,
# so those metrics are likely optimistic. Consider a separate validation split.
history = model.fit(x=[X_train, train_mask],y=Y_train,validation_data=([X_test, test_mask], Y_test),
                    epochs=30,batch_size=32,callbacks=[early_stopping])

In [None]:
# Training vs. validation loss per epoch.
plt.figure(figsize=(7, 5))
loss_curves = (
    ("loss", "*", "red", "Loss"),
    ("val_loss", "P", "green", "Validation_Loss"),
)
for curve_key, curve_marker, curve_color, curve_label in loss_curves:
    plt.plot(
        history.history[curve_key],
        lw=2,
        marker=curve_marker,
        markersize=10,
        color=curve_color,
        label=curve_label,
    )
plt.title("Model Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
plt.figure(figsize=(7, 5))
accuracy_curves = (
    ("accuracy", "H", "purple", "Accuracy"),
    ("val_accuracy", "D", "crimson", "Validation_Accuracy"),
)
for curve_key, curve_marker, curve_color, curve_label in accuracy_curves:
    plt.plot(
        history.history[curve_key],
        lw=2,
        marker=curve_marker,
        markersize=10,
        color=curve_color,
        label=curve_label,
    )
plt.title("Model Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(loc="best")
plt.show()

# Model predict

In [None]:
# Sigmoid outputs for the hold-out set, then hard labels with a 0.5 cut-off.
pred = model.predict([X_test, test_mask])
final_pred = (pred >= 0.5).astype(int)


# ROC Curve

In [None]:
# ROC curve of the predicted probabilities against the hold-out labels.
fpr, tpr, thresholds = roc_curve(Y_test, pred)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(7, 5))
plt.plot(
    fpr,
    tpr,
    color="red",
    lw=1.7,
    marker="*",
    markersize=5,
    label=f"ROC curve (area = {roc_auc:.2f})",
)
# Diagonal from (0, 0) to (1, 1) drawn for reference.
plt.plot([0, 1], [0, 1], color="black", lw=3)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.title("Receiver Operating Characteristic")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.show()

# Precision-Recall Curve

In [None]:
# Precision-recall curve and average precision on the hold-out set.
precision, recall, thresholds = precision_recall_curve(Y_test, pred)
ap_score = average_precision_score(Y_test, pred)

plt.figure(figsize=(7, 5))
plt.plot(
    recall,
    precision,
    color="purple",
    lw=2.8,
    label=f"Precision-Recall Curve (AP = {ap_score:.2f})",
)
plt.title("Precision-Recall Curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend(loc="lower left")
plt.show()

In [None]:
# ROC AUC on the hold-out set, shown as centred text on an otherwise empty figure.
roc_auc = roc_auc_score(Y_test, pred)

plt.plot([])
plt.axis("off")

# Symmetric limits on both axes so the text at (0, 0) sits in the middle.
plt.xlim(-1, 1)
plt.ylim(-1, 1)
plt.text(
    0,
    0,
    f"ROC AUC  Score: {roc_auc:.4f}",
    color="indigo",
    fontsize=16,
    ha="center",
    va="center",
)

plt.show()

In [None]:
# Log loss must be computed from the predicted probabilities. Passing the
# thresholded 0/1 labels (final_pred) treats every prediction as fully
# confident, so the result only reflects the error rate, not the calibration.
logarithm_loss = log_loss(Y_test, np.ravel(pred))

# Show the score as centred text on an otherwise empty figure.
plt.plot([])
plt.text(0,0, f'Log Loss: {logarithm_loss:.4f}', fontsize=16, ha='center', va='center',color="black")
plt.axis('off')

# Symmetric limits on both axes so the text at (0, 0) sits in the middle.
plt.xlim(-1, 1)
plt.ylim(-1,1)

plt.show()

In [None]:
# Cohen's kappa between true and predicted labels, shown as centred text.
kappa = cohen_kappa_score(Y_test, final_pred)

plt.plot([])
plt.axis("off")

# Symmetric limits on both axes so the text at (0, 0) sits in the middle.
plt.xlim(-1, 1)
plt.ylim(-1, 1)
plt.text(
    0,
    0,
    f"Cohen Kappa Score: {kappa:.4f}",
    color="orangered",
    fontsize=16,
    ha="center",
    va="center",
)

plt.show()

In [None]:
# Matthews correlation coefficient between true and predicted labels,
# shown as centred text on an otherwise empty figure.
mcc = matthews_corrcoef(Y_test, final_pred)

plt.plot([])
plt.axis("off")

# Symmetric limits on both axes so the text at (0, 0) sits in the middle.
plt.xlim(-1, 1)
plt.ylim(-1, 1)
plt.text(
    0,
    0,
    f"Matthews Correlation Coefficient: {mcc:.4f}",
    color="saddlebrown",
    fontsize=16,
    ha="center",
    va="center",
)

plt.show()


# Confusion Matrix

In [None]:
# Confusion matrix of hold-out labels vs. thresholded predictions.
label_name = ["Non Disaster", "Disaster"]
cm = confusion_matrix(Y_test, final_pred)

plt.figure(figsize=(7, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="rainbow",
    lw=6,
    linecolor="black",
    xticklabels=label_name,
    yticklabels=label_name,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()



# Classification Report

In [None]:
# Per-class report on the hold-out set, using the display names defined
# in label_name.
print(classification_report(Y_test,final_pred,target_names=label_name))

# Test Data Analysis


In [None]:
# Load the competition test file, for which predictions are produced below.
df_test=pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [None]:
df_test.head()

In [None]:
df_test.info()

In [None]:
df_test.describe()

In [None]:
df_test.isnull().sum()

In [None]:
# Drop the two columns that are not used for prediction or for the submission.
df_test = df_test.drop(columns=["keyword", "location"])

In [None]:
df_test.head()

In [None]:
# Raw test tweets to be tokenized.
test_text=df_test["text"]

In [None]:
def xlnet_encode(final_text_data, max_len):
    """Tokenize texts with the notebook-level ``xlnet_tokenizer``.

    Every text is encoded with special tokens, truncated to at most
    ``max_len`` tokens and padded to exactly ``max_len`` tokens, so the
    per-text results stack into rectangular arrays.

    Args:
        final_text_data: Iterable of strings (list, NumPy array, pandas
            Series, ...). Items are consumed in iteration order.
        max_len: Length of every encoded sequence.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of NumPy arrays with one row
        per input text.
    """
    input_ids = []
    attention_masks = []

    # Iterate over the values instead of indexing with range(len(...)):
    # ``series[i]`` is a label lookup on a pandas Series, so it raises or
    # returns rows in the wrong order once the index is not 0..n-1.
    for text in final_text_data:
        encode_data = xlnet_tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding="max_length",  # replaces the deprecated pad_to_max_length=True
            truncation=True,       # make truncation to max_len explicit
            return_attention_mask=True,
        )
        input_ids.append(encode_data["input_ids"])
        attention_masks.append(encode_data["attention_mask"])

    return np.array(input_ids), np.array(attention_masks)

In [None]:
# Tokenize the test tweets. This rebinds text_input_ids / text_attention_masks,
# which previously held the encoded training data.
text_input_ids,text_attention_masks = xlnet_encode(test_text,max_len)

In [None]:
# Predict a probability for every test tweet and convert it to a 0/1 label.
# np.asarray avoids copying inputs that already are NumPy arrays.
text_input_ids = np.asarray(text_input_ids)
text_attention_masks = np.asarray(text_attention_masks)
my_pred = model.predict([text_input_ids, text_attention_masks])

# The model ends in a single sigmoid unit, so my_pred has one column per row;
# flatten the labels so a plain 1-D vector is stored in the DataFrame column.
predicted_labels = np.where(my_pred >= 0.5, 1, 0).ravel()
df_test['target'] = predicted_labels


In [None]:
df_test.head(10)

In [None]:
df_test["target"].value_counts()

In [None]:
# Submission file: the test frame without the raw text column, written
# without the DataFrame index.
df0 = df_test.drop(columns="text")
df0.to_csv("my_submission.csv", index=False)