In [5]:
import pandas as pd
df=pd.read_csv('/content/drive/MyDrive/Dataset/train.csv')
df.head()
df.shape
df.info()
df['target'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,4342
1,3271


In [6]:
#Text cleaning
import re
import nltk
import string
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stopwords=set(stopwords.words('english'))
lemmatizer=WordNetLemmatizer()

def clean_text(text):
  text=text.lower()
  text=re.sub(r'http\S+|www\S+', '', text)
  text=re.sub(r'@\w+|#\w+', '', text)
  text=re.sub(r'[^a-z\s]', '', text)
  tokens=text.split()
  tokens=[lemmatizer.lemmatize(word) for word in tokens if word not in stopwords]
  return " ".join(tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [7]:
df['clean_text']=df['text'].apply(clean_text)
df[['text','clean_text']].head()

Unnamed: 0,text,clean_text
0,Our Deeds are the Reason of this #earthquake M...,deed reason may allah forgive u
1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,resident asked shelter place notified officer ...
3,"13,000 people receive #wildfires evacuation or...",people receive evacuation order california
4,Just got sent this photo from Ruby #Alaska as ...,got sent photo ruby smoke pours school


In [8]:
#text length reduction
df['text_length']=df['text'].apply(len)
df['clean_text_length']=df['clean_text'].apply(len)
df[['text_length','clean_text_length']].describe()

Unnamed: 0,text_length,clean_text_length
count,7613.0,7613.0
mean,101.037436,55.223959
std,33.781325,23.68106
min,7.0,0.0
25%,78.0,38.0
50%,107.0,55.0
75%,133.0,72.0
max,157.0,137.0


In [11]:
from sklearn.model_selection import train_test_split
x=df['clean_text']
y=df['target']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,
                                               random_state=42,stratify=y)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid=TfidfVectorizer(
    max_features=10000,
    ngram_range=(1,2),
    min_df=2
)
x_train_tfid=tfid.fit_transform(x_train)
x_test_tfid=tfid.transform(x_test)
x_train_tfid.shape

(6090, 8427)

In [13]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
lr=LogisticRegression(
    max_iter=1000,
    class_weight='balanced'
)
lr.fit(x_train_tfid,y_train)
y_pred=lr.predict(x_test_tfid)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.8049901510177282
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       869
           1       0.78      0.76      0.77       654

    accuracy                           0.80      1523
   macro avg       0.80      0.80      0.80      1523
weighted avg       0.80      0.80      0.80      1523



In [15]:
# svm
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score,classification_report
svm=LinearSVC(class_weight='balanced')
svm.fit(x_train_tfid,y_train)
y_pred_svm=svm.predict(x_test_tfid)
print("Accuracy:",accuracy_score(y_test,y_pred_svm))
print(classification_report(y_test,y_pred_svm))

Accuracy: 0.783322390019698
              precision    recall  f1-score   support

           0       0.82      0.80      0.81       869
           1       0.74      0.77      0.75       654

    accuracy                           0.78      1523
   macro avg       0.78      0.78      0.78      1523
weighted avg       0.78      0.78      0.78      1523



In [16]:
import pandas as pd
results=pd.DataFrame({
    'text':x_test,
    'true_label':y_test,
    'pred_label':y_pred
})
false_negatives=results[(results.true_label==1) & (results.pred_label==0)]
false_negatives.head()

Unnamed: 0,text,true_label,pred_label
6837,hollywood movie trapped miner released chile h...,1,0
2905,cant drown demon know swim,1,0
1956,need plant pacific cyclone season would help,1,0
6569,dear name humanityi apologized survivorsr u ready,1,0
5020,look like mudslide poor thing,1,0


In [19]:
import joblib
joblib.dump(tfid,'../content/drive/MyDrive/Dataset/tfid.pkl')
joblib.dump(lr,"../content/drive/MyDrive/Dataset/Disaster_tweet_model.pkl")

['../content/drive/MyDrive/Dataset/Disaster_tweet_model.pkl']

In [20]:
loaded_model = joblib.load("../content/drive/MyDrive/Dataset/Disaster_tweet_model.pkl")
loaded_vectorizer = joblib.load("../content/drive/MyDrive/Dataset/tfid.pkl")

sample_text = ["Massive fire breaks out in downtown area"]
sample_vec = loaded_vectorizer.transform(sample_text)
loaded_model.predict(sample_vec)


array([1])