In [1]:
import pandas as pd

# Read the headerless two-column CSV: the first column is named 'sentiment'
# and the second 'text'. The file is decoded as ISO-8859-1.
df = pd.read_csv(
    './news_data.csv',
    header=None,
    names=['sentiment', 'text'],
    encoding='ISO-8859-1',
)

import re
import nltk
# Fetch the NLTK packages this notebook relies on: 'stopwords' backs
# stopwords.words('english'); 'punkt' is presumably the data behind
# nltk.word_tokenize (confirm against the installed NLTK version).
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Lazily populated cache so the stop-word set and the stemmer are built once
# rather than on every call (the function is applied to each row of the frame).
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the cached ``(stop_words, stemmer)`` pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['stemmer'] = PorterStemmer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['stemmer']


def preprocess_text(text):
    """Clean one document and return it as a space-joined string of stems.

    Processing order:
      1. remove ``http...`` URL-like runs (matched before lowercasing, so the
         match is case-sensitive);
      2. replace every character outside ``a-z``/``A-Z`` with a space;
      3. lowercase;
      4. tokenize with ``nltk.word_tokenize``;
      5. drop English stop words;
      6. Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (for example the NaN pandas uses
        for an empty CSV cell) is treated as an empty document.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; ``''`` when nothing
        survives the cleaning or the input is not a string.
    """
    # re.sub raises TypeError on non-strings; an empty document is the sensible
    # result because step 2 would discard digits and punctuation anyway.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http\S+', '', text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()

    stop_words, porter = _preprocess_resources()
    stemmed_words = [
        porter.stem(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(stemmed_words)

# Overwrite the 'text' column with the output of preprocess_text for each row.
cleaned_text = df['text'].apply(preprocess_text)
df['text'] = cleaned_text

# Last expression of the cell: render the processed frame.
df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,sentiment,text
0,neutral,accord gran compani plan move product russia a...
1,neutral,technopoli plan develop stage area less squar ...
2,negative,intern electron industri compani elcoteq laid ...
3,positive,new product plant compani would increas capac ...
4,positive,accord compani updat strategi year baswar targ...
...,...,...
4841,negative,london marketwatch share price end lower londo...
4842,neutral,rinkuskiai beer sale fell per cent million lit...
4843,negative,oper profit fell eur mn eur mn includ vessel s...
4844,negative,net sale paper segment decreas eur mn second q...


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['sentiment'], test_size=0.3, random_state=42
)

# Unigram + bigram TF-IDF with sublinear term-frequency scaling; terms seen in
# fewer than 5 training documents are dropped. Because a custom tokenizer is
# supplied, the default token_pattern would be ignored and scikit-learn warns
# about it; token_pattern=None states that explicitly and avoids the warning.
tfidf = TfidfVectorizer(
    min_df=5,
    ngram_range=(1, 2),
    tokenizer=nltk.word_tokenize,
    token_pattern=None,
    sublinear_tf=True,
)

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split. Both matrices are
# converted to dense arrays, as the downstream cells expect.
X_train_tfidf = tfidf.fit_transform(X_train).toarray()

X_test_tfidf = tfidf.transform(X_test).toarray()



In [15]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Linear-kernel SVM with C=1. gamma is deliberately not set: it is a
# coefficient for the 'rbf', 'poly' and 'sigmoid' kernels only and has no
# effect on a linear kernel, so passing it would wrongly suggest it was tuned.
svm = SVC(kernel='linear', C=1)
svm.fit(X_train_tfidf, y_train)

# Score the held-out split and print per-class precision / recall / F1.
svm_pred = svm.predict(X_test_tfidf)
print(classification_report(y_test, svm_pred))

              precision    recall  f1-score   support

    negative       0.71      0.54      0.61       179
     neutral       0.76      0.91      0.83       847
    positive       0.75      0.52      0.62       428

    accuracy                           0.75      1454
   macro avg       0.74      0.66      0.68      1454
weighted avg       0.75      0.75      0.74      1454



In [16]:
import joblib

# Persist the fitted TF-IDF vectorizer as well: the classifier was trained on
# its feature space, so new text cannot be scored from model.pkl alone.
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

# Kept as the last expression so the cell still displays ['model.pkl'].
joblib.dump(svm, 'model.pkl')

['model.pkl']