# In [None]: (notebook cell marker left over from the Jupyter export)
#Load the packages
import pandas as pd #we use this to load, read and transform the dataset
import numpy as np #we use this for statistical analysis

#To visualize the dataset
import matplotlib.pyplot as plt #we use this to visualize the dataset

#To test the models
import sklearn.metrics as sklm 
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

#Text cleaning and analyzers
from string import punctuation
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize

#Models
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,SGDClassifier, LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from time import time

# `dataset` is not defined in the visible part of this file; presumably the
# host environment injects it before the script runs -- TODO confirm.
# Note this is an alias, not a copy: edits to `data` also change `dataset`.
data = dataset

# Tokens to discard during cleaning: NLTK's English stopwords plus every
# single punctuation character from string.punctuation.
stop = set(stopwords.words('english')) | set(punctuation)

# Translate a part-of-speech tag into the matching WordNet POS constant.
def get_simple_pos(tag):
    """Return the WordNet POS constant selected by the first letter of *tag*.

    Tags starting with 'J' map to ``wordnet.ADJ``, 'V' to ``wordnet.VERB``,
    'N' to ``wordnet.NOUN`` and 'R' to ``wordnet.ADV``. Any other tag
    (including the empty string) falls back to ``wordnet.NOUN``.
    """
    # (tag prefix, attribute name on `wordnet`), checked in this order.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_table:
        if tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
    
# Lemmatizer instance shared by every clean_review() call.
lemmatizer = WordNetLemmatizer()
def clean_review(text):
    """Tokenize *text*, drop stopwords/punctuation and lemmatize what is left.

    Args:
        text: The raw string to clean. Anything that is not a ``str``
            (for example a NaN from a missing cell) yields an empty list
            instead of failing inside the tokenizer.

    Returns:
        A list of lemmatized tokens, in their original order, excluding every
        token whose lowercase form is in the module-level ``stop`` set.
    """
    if not isinstance(text, str):
        return []

    tokens = word_tokenize(text)
    # Tag the whole token sequence in a single call. Tagging one word at a
    # time hides the surrounding words from the tagger and repeats the
    # tagging call for every token.
    tagged_tokens = pos_tag(tokens)

    # Stopwords are filtered only after tagging so they still provide
    # context for the tags of the words that are kept.
    return [
        lemmatizer.lemmatize(word, pos=get_simple_pos(tag))
        for word, tag in tagged_tokens
        if word.lower() not in stop
    ]

def join_text(text):
    """Return the items of *text* concatenated with single spaces between them."""
    separator = " "
    return separator.join(text)

# Clean every headline, then store it back as one space-joined string.
data['Newsline'] = data['Newsline'].apply(clean_review)
data['Newsline'] = data['Newsline'].apply(join_text)

# Hold out 20% of the rows for testing; random_state pins the split.
x_train, x_test, y_train, y_test = train_test_split(
    data.Newsline, data.Sentiment, test_size=0.2, random_state=0
)

# Training headlines grouped by their sentiment label.
pos = x_train[y_train[y_train == 'positive'].index]
neg = x_train[y_train[y_train == 'negative'].index]
neutral = x_train[y_train[y_train == 'neutral'].index]

from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=1 keeps every term, which is what the previous int 0 meant; an int
# min_df must be >= 1 and recent scikit-learn releases reject 0.
tvec = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), max_df=0.9, min_df=1)

# Fit the vocabulary on the training split only, then reuse that same fitted
# vectorizer for the test split and for the full dataset. Fitting a second
# time on other data would produce columns that do not line up with the
# features the model is trained on.
# .toarray() yields plain ndarrays; the np.matrix objects returned by
# .todense() are not accepted by recent scikit-learn estimators.
x_train_features = tvec.fit_transform(x_train).toarray()
x_test_features = tvec.transform(x_test).toarray()
X = tvec.transform(data.Newsline).toarray()

lr = LogisticRegression()
lr.fit(x_train_features, y_train)

# Score every row of the dataset, including the rows used for training.
y_pred = lr.predict(X)
# NOTE(review): column 1 is the probability of lr.classes_[1], not of the
# predicted class. With the three labels used above that is a fixed class
# for every row -- confirm this is the intended value.
y_prob = lr.predict_proba(X)[:, 1]

data['Predictions'] = y_pred
data['Probabilities'] = y_prob

# Compute the two reported coefficients, starting from zero for both.
b0 = 0
b1 = 0

# Logistic (sigmoid) output of the starting coefficients for an input of 1.0.
_logit = b0 + b1 * 1.0
prediction = 1 / (1 + np.exp(-_logit))

# Mean over every entry of the full feature matrix X.
x1 = X.mean()

label = 0

# Factor shared by both updates: 0.3 (presumably a learning rate -- confirm)
# times the error times prediction * (1 - prediction).
_update = 0.3 * (label - prediction) * prediction * (1 - prediction)
coef_intercept = b0 + _update * 1.0
coef_newsline = b1 + _update * x1

# Two-row table with the resulting values, newsline first.
_component_names = ["newsline", 'intercept']
_component_values = [coef_newsline, coef_intercept]
components = pd.DataFrame({'component': _component_names, 'value': _component_values})

