In [54]:
# The following is modeled after the tutorial for the competition at: https://www.kaggle.com/code/philculliton/nlp-getting-started-tutorial/notebook
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, feature_selection, linear_model, model_selection, preprocessing, pipeline, decomposition, naive_bayes
from nltk import tokenize, classify, download

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
dataframes = {}

for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        filepath = os.path.join(dirname, filename)
        dataframes[filename] = pd.read_csv(filepath)
        print(filepath)

sample = dataframes['sample_submission.csv']
train = dataframes['train.csv']
test = dataframes['test.csv']

print(train[train["target"] == 0]["text"].values[0])
print(train[train["target"] == 1]["text"].values[0])

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./data/sample_submission.csv
./data/test.csv
./data/train.csv
What's up man?
Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all


In [47]:
# Data stats
print('# of normal tweets:',train[train["target"] == 0]["text"].count())
print('# of disaster tweets:',train[train["target"] == 1]["text"].count())

# of normal tweets: 4342
# of disaster tweets: 3271


In [48]:
# Get cross-validated scores
def getScores(clf,train_vectors,train):
    scores = model_selection.cross_val_score(clf, train_vectors, train["target"], cv=5, scoring="f1")
    return scores

In [49]:
# Using a Count Vectorizer and Ridge Classifier
count_vectorizer = feature_extraction.text.CountVectorizer()

## Get counts for the data
train_vectors = count_vectorizer.fit_transform(train["text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test["text"])

## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
clf = linear_model.RidgeClassifier()

print(getScores(clf,train_vectors,train))

# Using TFID Vectorizer and Ridge Classifier
tfid_vectorizer = feature_extraction.text.TfidfVectorizer()

## Get counts for the data
train_vectors = tfid_vectorizer.fit_transform(train["text"])

test_vectors = tfid_vectorizer.transform(test["text"])

clf = linear_model.RidgeClassifier()
print(getScores(clf,train_vectors,train))

# Using Hashing Vectorizer and  Ridge Classifier
hashing_vectorizer = feature_extraction.text.HashingVectorizer()

## Get counts for the data
train_vectors = hashing_vectorizer.transform(train["text"])

test_vectors = hashing_vectorizer.transform(test["text"])

clf = linear_model.RidgeClassifier()
print(getScores(clf,train_vectors,train))

# Using TFID Vectorizer and SGD Classifier
tfid_vectorizer = feature_extraction.text.TfidfVectorizer()

## Get counts for the data
train_vectors = tfid_vectorizer.fit_transform(train["text"])

test_vectors = tfid_vectorizer.transform(test["text"])

clf = linear_model.SGDClassifier(loss="log")
print(getScores(clf,train_vectors,train))

# create a function for the tweet tokenizer from NLTK
def tok(text):
    tt = tokenize.TweetTokenizer()
    return tt.tokenize(text)

# Using TFID Vectorizer and SGD Classifier w/ Tweet Tokenizer
tfid_vectorizer = feature_extraction.text.TfidfVectorizer(tokenizer=tok)

## Get counts for the data
train_vectors = tfid_vectorizer.fit_transform(train["text"])

test_vectors = tfid_vectorizer.transform(test["text"])

clf = linear_model.SGDClassifier(loss="log")
print(getScores(clf,train_vectors,train))

# Using TFID Vectorizer and SGD Classifier w/ Tweet Tokenizer and SelectKBest Feature Selector
tfid_vectorizer = feature_extraction.text.TfidfVectorizer(tokenizer=tok)
chi2_selector = feature_selection.SelectKBest(feature_selection.chi2,k=1000)

## Get counts for the data
train_vectors = chi2_selector.fit_transform(tfid_vectorizer.fit_transform(train["text"]),train["target"])

test_vectors = chi2_selector.transform(tfid_vectorizer.transform(test["text"]))

clf = linear_model.SGDClassifier(loss="log")
print(getScores(clf,train_vectors,train))

[0.6025641  0.50168919 0.56985004 0.50781969 0.67275495]
[0.62962963 0.55507372 0.64457332 0.59444444 0.72337043]
[0.62969925 0.56480687 0.64860427 0.59332732 0.72684458]
[0.6324473  0.59252669 0.64869419 0.59418932 0.73127036]
[0.63377609 0.58480565 0.6366782  0.58790698 0.73717443]
[0.62901655 0.67093236 0.66725198 0.62200957 0.75963905]


In [68]:
# Sentiment analysis baseline
from nltk.sentiment import SentimentIntensityAnalyzer
download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

def is_positive(tweet: str) -> int:
    # compound refers to overall positive/negative sentiment
    scores = sia.polarity_scores(tweet)
    #if sia.polarity_scores(tweet)["compound"] > 0:
    if scores['neg'] <= scores['pos']:
        return 0
    else:
        return 1

# Evaluating training data
predictions =  train['text'].apply(is_positive)
correct_ratio = np.where(train['target'] == predictions, 1, 0).sum()/train.shape[0]
print(f'Correctly identified training cases {correct_ratio}')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/niklasz/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Correctly identified training cases 0.574412189675555


In [50]:
# Fit data using Ridge Classifier and training data
# Set predictions in sample's target space
clf.fit(train_vectors, train["target"])
sample["target"] = clf.predict(test_vectors)
# sample["text"] = test["text"]
print(sample)
print(test)

         id  target
0         0       1
1         2       0
2         3       1
3         9       0
4        11       1
...     ...     ...
3258  10861       1
3259  10865       0
3260  10868       1
3261  10874       1
3262  10875       0

[3263 rows x 2 columns]
         id keyword location  \
0         0     NaN      NaN   
1         2     NaN      NaN   
2         3     NaN      NaN   
3         9     NaN      NaN   
4        11     NaN      NaN   
...     ...     ...      ...   
3258  10861     NaN      NaN   
3259  10865     NaN      NaN   
3260  10868     NaN      NaN   
3261  10874     NaN      NaN   
3262  10875     NaN      NaN   

                                                   text  
0                    Just happened a terrible car crash  
1     Heard about #earthquake is different cities, s...  
2     there is a forest fire at spot pond, geese are...  
3              Apocalypse lighting. #Spokane #wildfires  
4         Typhoon Soudelor kills 28 in China and Taiwan  
..

In [51]:
# Creates a .csv file with the predictons
sample.to_csv("submission.csv", index=False)