# Imports

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import eli5
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import spacy


# Load data

In [125]:
# Load the training set from CSV.
# NOTE(review): the loaded frame carries an 'Unnamed: 0' column, which looks like a
# row index that was written to the file by an earlier to_csv call -- confirm.
train_df = pd.read_csv('train_df.csv')

In [126]:
# Summarize the training frame: column names, non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         7613 non-null   int64 
 1   id                 7613 non-null   int64 
 2   keyword            7552 non-null   object
 3   location           5080 non-null   object
 4   text               7613 non-null   object
 5   target             7613 non-null   int64 
 6   text_length        7613 non-null   int64 
 7   clean_text         7606 non-null   object
 8   clean_location     4816 non-null   object
 9   clean_keyword      7552 non-null   object
 10  clean_joined_text  7613 non-null   object
dtypes: int64(4), object(7)
memory usage: 654.4+ KB


In [4]:
# Load the test set from CSV; the clean_* columns are derived from it further down.
test_df = pd.read_csv('data/test.csv')

In [128]:
# Summarize the test frame: column names, non-null counts and dtypes.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


# Vectorize the data

In [10]:
# Load the spaCy "en_core_web_sm" pipeline once; the preprocess_* helpers below
# use it through this module-level `nlp` name.
nlp = spacy.load("en_core_web_sm")

## Vectorize train data

In [158]:
# Upper bound on the TF-IDF vocabulary size (passed as max_features). Despite the
# name, this caps the number of n-gram features, not the length of a document.
MAX_LEN = 10000
# Word uni-, bi- and tri-grams.
vectorizer_tfidf = TfidfVectorizer(max_features=MAX_LEN, ngram_range=(1, 3))

# Learn the vocabulary / IDF weights from the training text and vectorize it in a
# single pass. The fitted vectorizer is reused later to transform the test text.
vectorized_text_train_tfidf = vectorizer_tfidf.fit_transform(train_df["clean_joined_text"])

## Vectorize test data

In [11]:
# Functions for data preprocessing.

def preprocess_text(txt: str) -> str:
    """Normalize one text with the module-level ``nlp`` pipeline.

    The text is tokenized, then every token that is a stop word, is not
    alphabetic, or has a length of two or fewer is discarded. The lowercased
    lemmas of the surviving tokens are returned joined by single spaces.
    """
    kept_lemmas = []
    for tok in nlp(txt):
        # Same filter as "keep if not stop and alpha and len > 2", written as a skip.
        if tok.is_stop or not tok.is_alpha or len(tok) <= 2:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return " ".join(kept_lemmas)

def preprocess_location(txt: str) -> str:
    """Normalize one location string with the module-level ``nlp`` pipeline.

    The string is tokenized, stop words and non-alphabetic tokens are
    discarded, and the lowercased lemmas of the remaining tokens are returned
    joined by single spaces. No minimum token length is applied here.
    """
    kept_lemmas = []
    for tok in nlp(txt):
        if tok.is_stop or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return " ".join(kept_lemmas)


def preprocess_keyword(txt: str) -> str:
    """Normalize one keyword string with the module-level ``nlp`` pipeline.

    Every "%20" sequence is first replaced by a space. The result is
    tokenized, stop words and non-alphabetic tokens are discarded, and the
    lowercased lemmas of the remaining tokens are returned joined by single
    spaces.
    """
    decoded = txt.replace("%20", " ")
    kept_lemmas = []
    for tok in nlp(decoded):
        if tok.is_stop or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return " ".join(kept_lemmas)

In [6]:
# Missing values in the 'location' and 'keyword' columns become empty strings.

for nullable_col in ("location", "keyword"):
    test_df[nullable_col] = test_df[nullable_col].fillna("")

In [12]:
# Derive the clean_* columns of the test data: each raw column is passed
# through its dedicated preprocessing function, in the order listed.

for clean_col, raw_col, cleaner in (
    ("clean_text", "text", preprocess_text),
    ("clean_location", "location", preprocess_location),
    ("clean_keyword", "keyword", preprocess_keyword),
):
    test_df[clean_col] = test_df[raw_col].apply(cleaner)

In [13]:
# Join the three cleaned columns with a space between them. Without a separator the
# last word of the text fuses with the first word of the location (and likewise for
# the keyword), producing tokens the vectorizer cannot match.
# NOTE(review): train_df['clean_joined_text'] is loaded from disk -- confirm it was
# built with the same separator so train and test features stay comparable.
test_df["clean_joined_text"] = (
    test_df["clean_text"]
    + " "
    + test_df["clean_location"]
    + " "
    + test_df["clean_keyword"]
).str.strip()

In [159]:
# Vectorize the test text with the already-fitted vectorizer (transform only, no
# refit), so the test features use the vocabulary learned from the training text.
vectorized_text_test_tfidf = vectorizer_tfidf.transform(test_df["clean_joined_text"])

# Train the model

In [153]:
# Configure a logistic regression with built-in cross-validation; it is fitted in a
# later cell.
logistic_regression_CV = LogisticRegressionCV(
    cv=5,  # number of cross-validation folds
    random_state=0,  # fixed seed
    solver='newton-cg',
    max_iter=1000,  # iteration cap for the solver
    # NOTE(review): refit=False means no final refit on the full training data; per the
    # scikit-learn docs the best per-fold coefficients are averaged instead -- confirm
    # this is intended.
    refit=False)

In [160]:
# Fit on the sparse TF-IDF matrix directly. The estimator accepts sparse input, and
# calling .toarray() would allocate a dense (n_documents x n_features) float array.
logistic_regression_CV.fit(
    vectorized_text_train_tfidf,
    train_df["target"],
)

# Get predictions

In [161]:
# Build the predictions frame in one step: the test ids alongside the predicted
# class for each row. The sparse test matrix is passed as-is; densifying it with
# .toarray() is unnecessary for predict().
predictions_df = pd.DataFrame(
    {
        "id": test_df["id"],
        "target": logistic_regression_CV.predict(vectorized_text_test_tfidf),
    }
)

In [162]:
# Display the predictions frame for a quick visual check.
predictions_df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [164]:
# Write the predictions to CSV; index=False keeps the pandas row index out of the file.
predictions_df.to_csv("predictions.csv", index=False)

# TODO
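* try bigrams, trigrams — note that the vectorizer above already uses `ngram_range=(1, 3)`, so what remains is comparing against unigrams only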
* try xgboost

In [15]:
# Persist the preprocessed test frame. index=False keeps the row index out of the
# file (matching how predictions.csv is written), so reloading it with a plain
# pd.read_csv does not produce a spurious 'Unnamed: 0' column.
test_df.to_csv("test_df.csv", index=False)