In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
import eli5
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import spacy


In [3]:
# Load the training set. Later cells read its "clean_joined_text" column
# (TF-IDF input) and its "target" column (labels).
train_df = pd.read_csv('train_df.csv')

In [4]:
# Cap on the TF-IDF vocabulary size: this value is passed as `max_features`
# to TfidfVectorizer (it is not a sequence length, despite the name).
MAX_LEN = 10000

In [56]:
# TF-IDF vectorizer whose vocabulary is capped at MAX_LEN terms.
vectorizer_tfidf = TfidfVectorizer(max_features=MAX_LEN)

# Learn the vocabulary/IDF weights and encode the training text in one pass.
# (Previously `vectorized_text_tfidf` was first bound to the return value of
# `fit()` -- i.e. the vectorizer itself -- and only then overwritten with the
# document-term matrix.)
vectorized_text_tfidf = vectorizer_tfidf.fit_transform(train_df["clean_joined_text"])

In [45]:
# Logistic regression with built-in 5-fold cross-validation, trained on the
# TF-IDF features against the "target" labels.
# The sparse TF-IDF matrix is passed as-is: the estimator accepts sparse input,
# so there is no need to materialise a dense (n_samples x up-to-MAX_LEN) array.
logistic_regression_CV = LogisticRegressionCV(
    cv=5,
    random_state=0,
    solver='newton-cg',
    max_iter=1000,
).fit(vectorized_text_tfidf, train_df['target'])

# Get predictions

In [9]:
# Load the raw test set; the cells below use its "id", "keyword",
# "location" and "text" columns.
test_df = pd.read_csv('data/test.csv')

In [10]:
# Display the raw test frame (the rendered output elides the middle rows).
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [28]:
# "location" and "keyword" contain missing values; swap them for empty strings
# so the preprocessing functions always receive a str.
test_df[["location", "keyword"]] = test_df[["location", "keyword"]].fillna("")

In [15]:
# Load the spaCy "en_core_web_sm" pipeline once; the preprocess_* functions
# below all call this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [24]:
def preprocess_text(txt: str) -> str:
    """Normalise one tweet body into a space-separated string of lemmas.

    The text is parsed with the notebook-level spaCy pipeline ``nlp``. A token
    is kept only if it is not a stop word, is purely alphabetic
    (``token.is_alpha``) and is longer than two characters. Each kept token
    contributes its lowercased lemma.

    Returns the kept lemmas joined by single spaces ("" if nothing survives).
    """
    kept_lemmas = []
    for token in nlp(txt):
        # Skip stop words, non-alphabetic tokens and very short tokens.
        if token.is_stop or not token.is_alpha or len(token) <= 2:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return " ".join(kept_lemmas)

def preprocess_location(txt: str) -> str:
    """Normalise a location string into a space-separated string of lemmas.

    The text is parsed with the notebook-level spaCy pipeline ``nlp``. Stop
    words and tokens that are not purely alphabetic (``token.is_alpha``) are
    dropped; every remaining token contributes its lowercased lemma. Unlike
    ``preprocess_text`` there is no minimum token length.

    Returns the kept lemmas joined by single spaces ("" if nothing survives).
    """
    doc = nlp(txt)
    kept_lemmas = []
    for token in doc:
        if token.is_stop or not token.is_alpha:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return " ".join(kept_lemmas)


def preprocess_keyword(txt: str) -> str:
    """Normalise a keyword string into a space-separated string of lemmas.

    Every literal ``%20`` sequence is first replaced by a space. The result is
    parsed with the notebook-level spaCy pipeline ``nlp``; stop words and
    tokens that are not purely alphabetic (``token.is_alpha``) are dropped,
    and every remaining token contributes its lowercased lemma.

    Returns the kept lemmas joined by single spaces ("" if nothing survives).
    """
    decoded = txt.replace("%20", " ")
    kept_lemmas = []
    for token in nlp(decoded):
        if token.is_stop or not token.is_alpha:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return " ".join(kept_lemmas)

In [29]:
# Each cleaned column is produced by running the matching raw column through
# its dedicated preprocessing function, row by row.
cleaners = {
    "clean_text": ("text", preprocess_text),
    "clean_location": ("location", preprocess_location),
    "clean_keyword": ("keyword", preprocess_keyword),
}
for clean_column, (raw_column, cleaner) in cleaners.items():
    test_df[clean_column] = test_df[raw_column].apply(cleaner)

In [21]:
# Draw 10 row positions (with replacement, as before) from a seeded generator
# so the examples printed below are the same on every Restart & Run All.
rng = np.random.default_rng(0)
indices = rng.integers(0, high=len(test_df), size=10)

# Raw tweets and their cleaned counterparts at the sampled positions.
raw_examples = test_df["text"].iloc[indices].tolist()
clean_examples = test_df["clean_text"].iloc[indices].tolist()

# Print both lists for a quick before/after sanity check of the preprocessing.
print(
    """
    Before preprocessing: \n
    %s \n
    After preprocessing: \n 
    %s
    """
    % (raw_examples, clean_examples)
)


    Before preprocessing: 

    ["@Alltheway80s I had a similar thing with John carpenters 'the thing' one girl threw up when the dogs 'exploded'", 'Bit-Defender hack \x89ÛÒ Held Hostage!: Late last week it was discovered that antivirus vendor ... http://t.co/2vC8CSTWy5 #damballa #infosec', "@lizjillies it's such a tongue twister", '#OVERPOPULATION Not only R women incapable of keeping their legs together 2 save the world from endless brats they want to RUN the disaster', '#WorldNews #World\n Saipan Has No Water Electricity in Typhoon Aftermath - Voice of America - World - Google News.. http://t.co/5sUdXgNdA3', 'RT Karnythia: Another #ErasureIsNotEquality example? Movies like The Impossible about a natural disaster in Thailand that focuses on white \x89Û_', "I can't believe it never occurred to me that I could *not* be deluged with Kickstarter emails.", "@dmon2112 @C_T_Morgan but the fire rings of NYC permits I'd have to jump through for a food truck don't make it appealing", 'West Si

In [60]:
# Join the three cleaned fields into one document per row, separated by spaces.
# Plain `+` without a separator fused the last token of one field with the
# first token of the next (e.g. "...crash" + "london" -> "crashlondon"),
# producing tokens the TF-IDF vocabulary cannot match.
# NOTE(review): the "clean_joined_text" column in train_df.csv is built outside
# this notebook -- confirm it is joined with the same separator.
test_df["clean_joined_text"] = (
    test_df["clean_text"]
    + " "
    + test_df["clean_location"]
    + " "
    + test_df["clean_keyword"]
)

In [61]:
# Encode the test documents with the vectorizer already fitted on the training
# text (transform only, no refit), so both matrices share the same vocabulary.
vectorized_text_tfidf_test = vectorizer_tfidf.transform(test_df["clean_joined_text"])

In [62]:
# Pair every test id with the model's predicted label.
# The sparse TF-IDF matrix is passed to `predict` directly; converting it to a
# dense array first only costs memory. The frame is built in one step instead
# of being grown column by column from an empty DataFrame.
predictions_df = pd.DataFrame(
    {
        "id": test_df["id"],
        "target": logistic_regression_CV.predict(vectorized_text_tfidf_test),
    }
)

In [63]:
# Display the id/target pairs (the rendered output elides the middle rows).
predictions_df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [64]:
# Write the id/target pairs to predictions.csv; index=False keeps the
# DataFrame index out of the file.
predictions_df.to_csv("predictions.csv", index=False)