In [1]:
# In this section we will be using Latent Semantic Analysis (LSA). LSA means to correlate the meaning - that is the topic - of the words
# to the output. As such, it requires an analysis of the context of the statement compared to the words used.
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn import *

In [2]:
# Get data
test_df = pd.read_csv("../0-Data/test.csv")
train_df = pd.read_csv("../0-Data/train.csv")

train_df.head()
train_df["text"]

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [3]:
# Pre-processing
def simple_preprocessing(text):
    text = re.sub('[$(.%),;!?]+','', text) #Remove common punctuation
    return text

new_train_df = [simple_preprocessing(txt.lower()) for txt in train_df["text"]]

In [4]:
# Make into vectors
stopword_list = stopwords.words('english')
vector = feature_extraction.text.CountVectorizer(max_df=0.7, min_df=5, token_pattern="[a-z']+", 
                      stop_words=stopword_list, max_features=2000) # Only top 2000 most frequently ocurring words are considered
train_vec = vector.fit_transform(new_train_df)
#train_vec = vector.fit_transform(train_df["text"])
test_vec = vector.transform(test_df["text"])

In [5]:
# Set up single value decomposition (SVD)
tsvd = decomposition.TruncatedSVD(n_components=train_vec.shape[1])
tsvd.fit(train_vec)
tsvd_mat = tsvd.transform(train_vec)

In [6]:
# Build model
clf = linear_model.RidgeClassifier()
scores = model_selection.cross_val_score(clf, tsvd_mat, train_df["target"], cv=3, scoring="f1")
scores

array([0.5961945 , 0.55705636, 0.6167693 ])