# Contents

In [1]:
# Data: basic-cleaned dataset
# Vectorization: bag-of-words technique
# Vectorizer: sklearn CountVectorizer
# Model classifier: Support Vector Classifier

# Setup

In [1]:
import logging

import mlflow
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Raw dataset

In [3]:
# Read basic_clean.csv, using the file's first column as the row index.
df = pd.read_csv(
    filepath_or_buffer='../../data/processed/basic_clean.csv',
    index_col=0,
)

In [4]:
# Display the loaded frame (pandas truncates the repr to the first and last rows).
df

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,would responded going,neutral
1,549e992a42,sooo sad miss san diego,negative
2,088c60f138,boss bullying,negative
3,9642c003ef,interview leave alone,negative
4,358bd9e861,sons could put releases already bought,negative
...,...,...,...
27476,4eac33d1c0,wish could see denver husband lost job afford,negative
27477,4f4c4fc327,wondered rake client made clear net force devs...,negative
27478,f67aae2310,yay good enjoy break probably need hectic week...,positive
27479,ed167662a5,worth,positive


# Removing rows with NaN values

In [5]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# Train a bag-of-words + SVC pipeline and record its parameters and metrics
# in an MLflow run.
#
# Names defined here and used by later cells: y_test, predictions.
# Note: logging.info records are only visible if the root logger is
# configured at INFO level (Python's default threshold is WARNING).

# Feature column (text) and target column (sentiment).
X = df['text']
Y = df['sentiment']

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, x_test, Y_train, y_test = train_test_split(X, Y, random_state=42)

# NOTE(review): `filename` was never defined in the notebook, so the original
# cell failed with NameError on its first MLflow call. It is assumed to be the
# dataset path read in the loading cell -- confirm.
filename = '../../data/processed/basic_clean.csv'

with mlflow.start_run(run_name='suport_vector_clasifier'):
    mlflow.log_param('file name', filename.split("/")[-1])

    # The original wrapped these constructors in a try/except whose handler
    # itself raised NameError (undefined `item`) and then fell through to use
    # possibly-unbound estimators. Instantiate directly so any error surfaces
    # where it happens.
    count_vectorizer = CountVectorizer(stop_words='english', strip_accents='ascii', lowercase=True, max_features=25000)
    svm = SVC(probability=True)
    logging.info(" Model instances created")

    # Chain vectorizer and classifier so both are fitted on the training split only.
    pipe = Pipeline(steps=[('vectorizer', count_vectorizer),
                           ('svm_model', svm)])
    logging.info(" Pipeline created.")

    pipe.fit(X_train, Y_train)

    # Log only the per-step hyperparameters (keys of the form "<step>__<param>").
    for key, value in pipe.get_params().items():
        if '__' in key:
            mlflow.log_param(key, value)
    logging.info(" Logged params.")

    # Predictions on the held-out split.
    predictions = pipe.predict(x_test)
    logging.info(" Predictions out.")

    # Metric name -> callable(y_true, y_pred).
    # recall_score defaults to average='binary', which raises ValueError on the
    # three-class target, so 'recall' was silently never logged. Macro-average
    # it instead. (Precision / average precision were disabled in the original
    # cell and remain out of scope here.)
    metrics_ = {
        'accuracy': accuracy_score,
        'balanced_accuracy': metrics.balanced_accuracy_score,
        'recall': lambda y_true, y_pred: metrics.recall_score(y_true, y_pred, average='macro'),
    }

    for metric, func in metrics_.items():
        try:
            result = func(y_test, predictions)
        except ValueError as err:
            # Best-effort: skip a metric that cannot be computed, but say why.
            logging.error(f"Not logged metric {metric}: {err}")
            continue
        print(f"{metric} = {result}")
        mlflow.log_metric(metric, result)

    logging.info(" Metrics logged.")

In [17]:
# Put the held-out labels and the pipeline's predictions side by side,
# keeping the test split's original row index.
real_pred = y_test.to_frame(name='y_true').assign(y_pred=predictions)

In [18]:
# Display true vs. predicted labels (the index is the row index of the test split).
real_pred

Unnamed: 0,y_true,y_pred
14833,neutral,neutral
14586,negative,neutral
17108,neutral,neutral
20881,negative,neutral
3391,neutral,neutral
...,...,...
4223,negative,negative
12488,negative,negative
26978,positive,positive
21778,neutral,neutral


# Metrics and model evaluation

In [19]:
# accuracy_score returns a fraction in [0, 1]. The original printed that
# fraction followed by "%" (e.g. "0.69 %"); the percent format spec scales it
# by 100 so the unit is correct. Also fixes the "archived" typo.
accuracy = accuracy_score(real_pred['y_true'], real_pred['y_pred'])
print(f"Accuracy achieved: {accuracy:.0%}")

Accuracy archived: 0.69 %


In [20]:
# Per-class precision / recall / F1 on the held-out split.
report = classification_report(
    y_true=real_pred['y_true'],
    y_pred=real_pred['y_pred'],
)
print(report)

              precision    recall  f1-score   support

    negative       0.79      0.53      0.63      1970
     neutral       0.61      0.82      0.70      2737
    positive       0.79      0.69      0.73      2143

    accuracy                           0.69      6850
   macro avg       0.73      0.68      0.69      6850
weighted avg       0.72      0.69      0.69      6850



In [21]:
# Confusion matrix as a labelled frame: rows are actual classes, columns are
# predicted classes.
# The label order is passed explicitly so the captions are guaranteed to line
# up with the matrix rather than relying on the implicit sorted label order.
# MultiIndex headers render as a grouped header instead of raw tuples.
class_labels = ['negative', 'neutral', 'positive']
captions = [label.capitalize() for label in class_labels]
pd.DataFrame(
    confusion_matrix(real_pred['y_true'], real_pred['y_pred'], labels=class_labels),
    index=pd.MultiIndex.from_product([['Actuals'], captions]),
    columns=pd.MultiIndex.from_product([['predicted'], captions]),
)

Unnamed: 0,"(predicted, Negative)","(predicted, Neutral)","(predicted, Positive)"
"(Actuals, Negative)",1038,816,116
"(Actuals, Neutral)",229,2238,270
"(Actuals, Positive)",46,629,1468
