# Proyecto de IA - ChatBox

Departamento de Ciencias de la Computación, Universidad de Chile.

CC6409: Taller de Desarrollo de Proyectos de IA - Otoño 2023

**Integrantes:**
- Garrido Martín
- Gómez Nahuel
- Santelices Gustavo


## Import

In [56]:
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, cohen_kappa_score, classification_report

from sklearn.base import BaseEstimator, TransformerMixin
import re
import numpy as np

## data

In [28]:
# handmade
# Hand-written samples, one CSV per file. The *_nawel files are
# ';'-separated, while the *_tavo files are ','-separated.
hand_nawel_urls = (
    "https://raw.githubusercontent.com/QwagPerson/ProyectoDeIA/main/Data/Handmade/asistire_nawel.csv",
    "https://raw.githubusercontent.com/QwagPerson/ProyectoDeIA/main/Data/Handmade/no_asistire_nawel.csv",
    "https://raw.githubusercontent.com/QwagPerson/ProyectoDeIA/main/Data/Handmade/reagendar_nawel.csv",
    "https://raw.githubusercontent.com/QwagPerson/ProyectoDeIA/main/Data/Handmade/pedir_nawel.csv",
)
hand_nawel_0, hand_nawel_1, hand_nawel_2, hand_nawel_3 = (
    pd.read_csv(url, sep=';') for url in hand_nawel_urls
)

hand_tavo_urls = (
    "https://raw.githubusercontent.com/QwagPerson/ProyectoDeIA/main/Data/Handmade/asistire_tavo.csv",
    "https://raw.githubusercontent.com/QwagPerson/ProyectoDeIA/main/Data/Handmade/no_asistire_tavo.csv",
)
hand_tavo_0, hand_tavo_1 = (
    pd.read_csv(url, sep=',') for url in hand_tavo_urls
)

In [29]:
# GPT
# Samples stored under Data/GPT_generated; every file is ';'-separated.
gpt_nawel_urls = (
    "https://raw.githubusercontent.com/QwagPerson/ProyectoDeIA/main/Data/GPT_generated/asistire_nawel.csv",
    "https://raw.githubusercontent.com/QwagPerson/ProyectoDeIA/main/Data/GPT_generated/no_asistire_nawel.csv",
    "https://raw.githubusercontent.com/QwagPerson/ProyectoDeIA/main/Data/GPT_generated/reagendar_nawel.csv",
    "https://raw.githubusercontent.com/QwagPerson/ProyectoDeIA/main/Data/GPT_generated/pedir_nawel.csv",
)
gpt_nawel_0, gpt_nawel_1, gpt_nawel_2, gpt_nawel_3 = (
    pd.read_csv(url, sep=';') for url in gpt_nawel_urls
)


In [30]:
# Stack every source into a single frame. The order below fixes the row
# order: handmade (nawel), GPT-generated, then handmade (tavo).
all_frames = [
    hand_nawel_0, hand_nawel_1, hand_nawel_2, hand_nawel_3,
    gpt_nawel_0, gpt_nawel_1, gpt_nawel_2, gpt_nawel_3,
    hand_tavo_0, hand_tavo_1,
]
df_ = pd.concat(all_frames)

# Non-null entries per column, shown as the cell output.
df_.count()

class    230
text     230
dtype: int64

In [31]:
# Number of samples per class label, to check how balanced the dataset is.
df_['class'].value_counts()

0    79
1    78
2    37
3    36
Name: class, dtype: int64

## Custom Features

In [62]:
class PreProccesingTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that cleans raw text before vectorization.

    For every input string it drops all characters except ASCII letters,
    Spanish accented vowels, ñ/Ñ, exclamation/question marks and whitespace,
    then normalizes whitespace so words are separated by exactly one space.
    """

    # Compiled once at class creation instead of on every call.
    _DISALLOWED_CHARS = re.compile(r"[^a-zA-ZáéíóúÁÉÍÓÚñÑ¡!¿?\s]")
    _WHITESPACE_RUN = re.compile(r"\s+")

    def preprocess(self, sentence):
        """Return a cleaned copy of ``sentence``.

        Args:
            sentence: The raw text to clean.

        Returns:
            The text with disallowed characters removed, every run of
            whitespace collapsed to a single space, and no leading or
            trailing whitespace.
        """
        # Keep only letters, accents, ñ, exclamation/question marks, whitespace.
        kept = self._DISALLOWED_CHARS.sub('', sentence)
        # Collapse whitespace runs of any length (removing characters can leave
        # several spaces in a row). Newlines count as whitespace, so words on
        # adjacent lines stay separate instead of being glued together.
        return self._WHITESPACE_RUN.sub(' ', kept).strip()

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Clean every string in ``X``.

        Args:
            X: Iterable of raw text strings.
            y: Ignored; accepted so the signature matches ``fit``.

        Returns:
            A NumPy array with one cleaned string per element of ``X``.
        """
        return np.array([self.preprocess(text) for text in X])

## Pipeline

### Run Pipeline

In [51]:
def run(dataset, pipeline, *, test_size=0.2, random_state=42):
    """Fit ``pipeline`` on a stratified split of ``dataset`` and report results.

    The data is shuffled and split with stratification on the ``'class'``
    column. The pipeline is fitted on the training part, used to predict the
    test part, and a classification report is printed.

    Args:
        dataset: Table indexable by column name, with a ``'text'`` column
            (model inputs) and a ``'class'`` column (labels).
        pipeline: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        test_size: Passed to ``train_test_split`` as ``test_size``.
            Defaults to 0.2.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible. Defaults to 42.

    Returns:
        The tuple ``(pipeline, X_train, X_test, y_train, y_test,
        predicted_labels)``, where ``pipeline`` is the same object that was
        passed in, after ``fit`` has been called on it.
    """
    texts = dataset['text']
    labels = dataset['class']

    X_train, X_test, y_train, y_test = train_test_split(
        texts,
        labels,
        shuffle=True,
        test_size=test_size,
        random_state=random_state,
        # Stratify so both parts keep the class proportions of the dataset.
        stratify=labels,
    )

    print(f"# Len Training Data: {len(X_train)}")
    print(f"# Len Testing Data: {len(X_test)}")

    pipeline.fit(X_train, y_train)
    predicted_labels = pipeline.predict(X_test)

    print(classification_report(y_test, predicted_labels))

    return pipeline, X_train, X_test, y_train, y_test, predicted_labels


## Defining Pipelines

In [33]:
# simple Pipeline only BOW

def get_experiment_0_pipeline():
    """Build the baseline pipeline: bag-of-words counts fed to Naive Bayes.

    Returns:
        An unfitted ``Pipeline`` with two steps: ``"features"``, a
        ``FeatureUnion`` holding a single ``"bow"`` ``CountVectorizer``,
        and ``"clf"``, a ``MultinomialNB`` classifier.
    """
    bag_of_words = ("bow", CountVectorizer())
    feature_step = ("features", FeatureUnion([bag_of_words]))
    classifier_step = ("clf", MultinomialNB())
    return Pipeline([feature_step, classifier_step])

In [60]:
# simple Pipeline only Preprocessing + BOW

def get_experiment_1_pipeline():
    """Build the pipeline: text cleaning, bag-of-words counts, Naive Bayes.

    Returns:
        An unfitted ``Pipeline`` with three steps: ``"preprocessing"``, a
        ``PreProccesingTransformer``; ``"features"``, a ``FeatureUnion``
        holding a single ``"bow"`` ``CountVectorizer``; and ``"clf"``, a
        ``MultinomialNB`` classifier.
    """
    cleaning_step = ("preprocessing", PreProccesingTransformer())
    bag_of_words = ("bow", CountVectorizer())
    feature_step = ("features", FeatureUnion([bag_of_words]))
    classifier_step = ("clf", MultinomialNB())
    return Pipeline([cleaning_step, feature_step, classifier_step])

## Implementación del modelo

In [52]:
# Simple Pipeline Test
# Experiment 0: bag-of-words features + Multinomial Naive Bayes, no cleaning.
pipeline0 = get_experiment_0_pipeline()

# run() splits the data, fits the pipeline and prints the classification
# report; its return value is discarded so the cell shows only the prints.
_ = run(df_, pipeline0)

# Len Training Data: 184
# Len Testing Data: 46
              precision    recall  f1-score   support

           0       0.88      0.88      0.88        16
           1       0.78      0.88      0.82        16
           2       1.00      0.57      0.73         7
           3       0.88      1.00      0.93         7

    accuracy                           0.85        46
   macro avg       0.88      0.83      0.84        46
weighted avg       0.86      0.85      0.84        46



In [63]:
# Simple Pipeline Test (+ preprocessing)
# Experiment 1: same as experiment 0, with the text-cleaning step placed
# before the bag-of-words features.
pipeline1 = get_experiment_1_pipeline()

# run() splits the data, fits the pipeline and prints the classification
# report; its return value is discarded so the cell shows only the prints.
_ = run(df_, pipeline1)

# Len Training Data: 184
# Len Testing Data: 46
              precision    recall  f1-score   support

           0       0.88      0.88      0.88        16
           1       0.78      0.88      0.82        16
           2       1.00      0.57      0.73         7
           3       0.88      1.00      0.93         7

    accuracy                           0.85        46
   macro avg       0.88      0.83      0.84        46
weighted avg       0.86      0.85      0.84        46

