In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rober\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rober\AppData\Roaming\nltk_data...


True

In [14]:
# Data preparation
data = pd.read_csv('data.csv')
X = data['text']
y = data['target']
X, X_val, y, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

data.head

<bound method NDFrame.head of         target                                               text
0            0  this is my last tweet of the day, so goodnight...
1            1  @vanessaparlo ahaha okay  yeah fer suree! i'll...
2            0  I'm gonna feel like shit at uni today, I'm sti...
3            0  can't find my phone charger.. So I'm switching...
4            0  @DottiAwesome It was just fade-to-black though...
...        ...                                                ...
499995       1                    Watching twilight ) teeeext me 
499996       1  @estoni for me it is a usual day for me fighti...
499997       0  @KeepEmCookin Question for you... can stress r...
499998       0                           okay then u made me sad 
499999       0  What about with me?  @lyndseyfree  now I'm sad...

[500000 rows x 2 columns]>

In [20]:
# make everything lowercase
def lower_case(text):
    return text.str.lower()

# remove punctuation
def remove_punctuation(text):
    return text.str.replace(r'[^\w\s]', '')

# remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return text.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# stemming
def stemming(text):
    stemmer = PorterStemmer()
    return text.apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# lemmatization
def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    return text.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# remove @names
def remove_at(text):
    return text.str.replace(r'@\w+', '')

# preprocess the data
def preprocess_data(X):
    X = lower_case(X)
    X = remove_punctuation(X)
    X = remove_stopwords(X)
    X = stemming(X)
    X = lemmatization(X)
    X = remove_at(X)
    return X


X = preprocess_data(X)
X_val = preprocess_data(X_val)

X.head()


269056    @lamadsterr - post complaint teacher www.teach...
499174                     oh god love sleep good afternoon
85143     @silver_m live melbourne, would love move amer...
260335                                           wait 4 ppl
338124          best birthday girl oh, got jumpsuit!! haha!
Name: text, dtype: object

In [22]:
# Tokenization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)
X_val = vectorizer.transform(X_val)


AttributeError: 'csr_matrix' object has no attribute 'lower'

In [25]:
# Model training
model = make_pipeline(StandardScaler(with_mean=False), LogisticRegression())
model.fit(X, y)


In [4]:
# Model evaluation
y_pred = model.predict(X_val)
print(accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))