In [53]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

# Baseline experiment: TF-IDF features + linear SVM on the rating/message data.

# Column names given to the two tab-separated fields; they are supplied here
# via `names=` rather than taken from the file.
ratingCol = "rating"
messageCol = "message"

# Read the data file and drop every row that has a missing value.
data = pd.read_csv("./a3_train_round1.tsv", names=[ratingCol, messageCol], sep="\t").dropna()

# Lowercase the messages with the vectorised string accessor instead of a
# per-row Python lambda.
data[messageCol] = data[messageCol].str.lower()

# Shuffle the dataset; the fixed seed keeps the row order reproducible.
data = data.sample(frac=1.0, random_state=0)

# Split into input part X and output part Y.
X = data[messageCol]
Y = data[ratingCol]

# Partition the data into training (80 %) and test (20 %) sets.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=0)

# The vectorizer lives inside the pipeline, so it is fitted on the training
# messages only and merely applied to the test messages at predict time.
pipeline = make_pipeline(TfidfVectorizer(), LinearSVC())
pipeline.fit(Xtrain, Ytrain)

print("Accuracy: %.1f %%" % (accuracy_score(Ytest, pipeline.predict(Xtest)) * 100))


Accuracy: 75.5 %


In [47]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score  # used by the final print; must be imported in this cell
from sklearn.model_selection import train_test_split

# Second experiment: bag-of-words counts + logistic regression on the same data.

# Column names given to the two tab-separated fields; they are supplied here
# via `names=` rather than taken from the file.
ratingCol = "rating"
messageCol = "message"

# Read the data file and drop every row that has a missing value.
data = pd.read_csv("./a3_train_round1.tsv", names=[ratingCol, messageCol], sep="\t").dropna()

# Shuffle the dataset; the fixed seed keeps the row order reproducible.
data = data.sample(frac=1.0, random_state=0)

# Split into input part X (raw messages) and output part (ratings).
X = data[messageCol]
data_labels = data[ratingCol]

# Partition the raw messages into training and test sets BEFORE extracting
# features, so the vocabulary is learned from the training messages only.
Xtrain_text, Xtest_text, Ytrain, Ytest = train_test_split(
    X, data_labels, test_size=0.2, random_state=0
)

# Word-level counts; case is deliberately preserved (lowercase=False).
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=False,
)

# Fit the vocabulary on the training split, then reuse it for the test split.
# The matrices are left sparse: converting them with .toarray() would allocate
# a dense (n_documents x vocabulary_size) array, and the model accepts sparse
# input directly.
Xtrain = vectorizer.fit_transform(Xtrain_text)
Xtest = vectorizer.transform(Xtest_text)

log_model = LogisticRegression(max_iter=1000)

log_model = log_model.fit(X=Xtrain, y=Ytrain)

print("Accuracy: %.1f %%" % (accuracy_score(Ytest, log_model.predict(Xtest)) * 100))


Accuracy: 74.4 %
