In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
from sklearn.naive_bayes import MultinomialNB


# Column names shared by every TSV split; the files carry no header row,
# which is why they are read with header=None.
LIAR_COLUMNS = [
    "id", "label", "statement", "subject(s)", "speaker",
    "speaker's job title", "state info", "party affiliation",
    "barely true counts", "false counts", "half true counts",
    "mostly true counts", "pants on fire counts", "context",
]

# Six-way verdicts collapsed to a binary target: 0 for the three "false"
# leaning verdicts, 1 for the three "true" leaning ones.
LABEL_TO_BINARY = {
    'pants-fire': 0, 'barely-true': 0, 'false': 0,
    'half-true': 1, 'mostly-true': 1, 'true': 1,
}

train_data = pd.read_csv('train.tsv', sep='\t', header=None, names=LIAR_COLUMNS)
test_data = pd.read_csv('test.tsv', sep='\t', header=None, names=LIAR_COLUMNS)

# map() + astype() yields a genuine int64 column. The previous chained
# Series.replace(list, scalar) calls produced an object column and depended on
# pandas silently downcasting it, which pandas 2.2 deprecates; without that
# downcast scikit-learn rejects the labels as an unknown target type.
# A verdict missing from LABEL_TO_BINARY maps to NaN, so astype raises here
# instead of letting a stray string reach the classifier.
train_data['label'] = train_data['label'].map(LABEL_TO_BINARY).astype('int64')
test_data['label'] = test_data['label'].map(LABEL_TO_BINARY).astype('int64')



In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

# English stop words. Nothing in this cell filters on them any more; the name
# is kept in case another cell still refers to it.
stop_words = set(stopwords.words('english'))

# Single analyzer instance reused by every analyze_sentiment call.
sid = SentimentIntensityAnalyzer()


def analyze_sentiment(text):
    """Return the VADER compound sentiment of ``text`` shifted up by one.

    The text is tokenized with ``word_tokenize`` and re-joined with single
    spaces before scoring, so the analyzer sees the tokenizer's spacing.

    Parameters
    ----------
    text : str
        Statement to score. Any non-string value (for example the NaN that
        pandas uses for a missing cell) is treated as having no sentiment.

    Returns
    -------
    float
        ``compound + 1``. NLTK documents ``compound`` as normalized to
        [-1, 1], which would put the result in [0, 2]; the shift is presumably
        there so the feature can be fed to MultinomialNB, which rejects
        negative values -- TODO confirm.
    """
    if not isinstance(text, str):
        # word_tokenize raises on non-strings; fall back to the neutral
        # value (compound 0, shifted by 1) rather than aborting the apply().
        return 1.0

    normalized_text = ' '.join(word_tokenize(text))
    return sid.polarity_scores(normalized_text)['compound'] + 1

# Score every statement of both splits; the result is stored as a new
# 'sentiment' column on the frame itself.
for split_frame in (train_data, test_data):
    split_frame['sentiment'] = split_frame['statement'].apply(analyze_sentiment)

# Model inputs: the sentiment score next to the raw statement text.
feature_columns = ['sentiment', 'statement']
train_features, test_features = train_data[feature_columns], test_data[feature_columns]

# Targets are the 'label' column of each split.
y_train, y_test = train_data['label'], test_data['label']

# Preview of the training features (last expression, so the cell displays it).
train_features.head()

Unnamed: 0,sentiment,statement
0,1.25,Says the Annies List political group supports ...
1,1.3612,When did the decline of coal start? It started...
2,1.3182,"Hillary Clinton agrees with John McCain ""by vo..."
3,1.7579,Health care reform legislation is likely to ma...
4,1.0,The economic turnaround started at the end of ...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

# Fit the TF-IDF vocabulary on the training statements and encode them.
vectorizer = TfidfVectorizer()
X_train_statement = vectorizer.fit_transform(train_features['statement'])

# Sentiment scores as a one-column sparse matrix so they can be stacked
# next to the TF-IDF matrix.
train_sentiment_column = train_features['sentiment'].values.reshape(-1, 1)
X_train_sentiment = sparse.csr_matrix(train_sentiment_column)

# Combined matrix: TF-IDF columns first, the sentiment column last.
X_train_features = sparse.hstack([X_train_statement, X_train_sentiment])




In [10]:
# One classifier per feature set, all trained on the same targets.
# fit() returns the estimator itself, so construction and training chain.
nb_combined_model = MultinomialNB().fit(X_train_features, y_train)
nb_statement_model = MultinomialNB().fit(X_train_statement, y_train)
nb_sentiment_model = MultinomialNB().fit(X_train_sentiment, y_train)

# Bare last expression: the cell still displays the sentiment model,
# which is what the trailing fit() call evaluated to before.
nb_sentiment_model

In [11]:
# Validation split: a header-less TSV read with explicit column names.
valid_data = pd.read_csv(
    'valid.tsv',
    sep='\t',
    header=None,
    names=[
        "id", "label", "statement", "subject(s)", "speaker",
        "speaker's job title", "state info", "party affiliation",
        "barely true counts", "false counts", "half true counts",
        "mostly true counts", "pants on fire counts", "context",
    ],
)

# Collapse the six verdicts to a binary int64 target with map() + astype().
# The previous chained Series.replace(list, scalar) calls left an object
# column and relied on pandas silently downcasting it, which pandas 2.2
# deprecates. A verdict not listed here maps to NaN, so astype raises instead
# of letting a stray string through.
valid_data['label'] = valid_data['label'].map({
    'pants-fire': 0, 'barely-true': 0, 'false': 0,
    'half-true': 1, 'mostly-true': 1, 'true': 1,
}).astype('int64')

# Same sentiment feature as the other splits, stored as a new column.
valid_data['sentiment'] = valid_data['statement'].apply(analyze_sentiment)

valid_features = valid_data[['sentiment', 'statement']]
y_valid = valid_data['label']

In [12]:
# Encode validation statements with the already-fitted vectorizer
# (transform only, so the vocabulary is not refitted here).
X_valid_statement = vectorizer.transform(valid_data['statement'])

# Sentiment scores as a one-column sparse matrix.
valid_sentiment_column = valid_data['sentiment'].values.reshape(-1, 1)
X_valid_sentiment = sparse.csr_matrix(valid_sentiment_column)

# Combined matrix: TF-IDF columns first, the sentiment column last.
X_valid_features = sparse.hstack([X_valid_statement, X_valid_sentiment])



In [13]:
from sklearn.metrics import accuracy_score
# Predict the validation labels with each model on its own feature matrix.
y_combined_pred = nb_combined_model.predict(X_valid_features)
y_statement_pred = nb_statement_model.predict(X_valid_statement)
y_sentiment_pred = nb_sentiment_model.predict(X_valid_sentiment)

# Report accuracy against the validation labels, one line per model.
# `accuracy` ends up holding the sentiment-only score, the last one computed.
for report_prefix, predictions in (
    ("Accuracy for combined features:", y_combined_pred),
    ("Accuracy for statement features:", y_statement_pred),
    ("Accuracy for sentiment features:", y_sentiment_pred),
):
    accuracy = accuracy_score(valid_data['label'], predictions)
    print(report_prefix, accuracy)

Accuracy for combined features: 0.5973520249221184
Accuracy for statement features: 0.6012461059190031
Accuracy for sentiment features: 0.5202492211838006
