In [None]:
# Import
# Basics
import pandas as pd
import numpy as np
import re

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Vectorize data
from sklearn.feature_extraction.text import TfidfVectorizer

# Data resampling
from sklearn.utils import resample

# Splitting data
from sklearn.model_selection import train_test_split

# Coding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

# Scaling
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Parameter selection
# NOTE: pearsonr is provided by SciPy, not by sklearn.feature_selection;
# importing it from scikit-learn raises ImportError and stops the whole notebook.
from sklearn.feature_selection import f_regression, mutual_info_regression, SelectKBest
from scipy.stats import pearsonr

# Models
# Classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# Regression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

# Model Optimization
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.metrics import mean_squared_error, r2_score

# Save the model
from pickle import dump

In [None]:
# Configure pandas to display all rows and columns
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Get data ("url" is a placeholder for the real CSV location)
df = pd.read_csv("url")

# Initial Data Display: the first rows are the cell output
df.head()

In [None]:
# DataFrame Dimensions
# Evaluates to a (number of rows, number of columns) tuple, shown as the cell output.
df.shape

In [None]:
# Basic Information on Data Types and Non-Null Values
# Prints a per-column summary of the DataFrame (dtype and non-null count).
df.info()

In [None]:
# Fill missing values
# Replace the missing entries of "var1" with 0 and write the result back to the column.
df["var1"] = df["var1"].fillna(0)

In [None]:
# Search and Removal of Duplicate Data
# Flag every row that repeats an earlier row, then count the flagged rows
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()

# If Necessary: keep only the first occurrence of each row
df = df[~duplicate_mask]

In [None]:
# Initial Selection of Relevant Features
# ...

In [None]:
# Analysis of Categorical Variables
# Categorical features to inspect (placeholders: extend or replace as needed)
categorical_features = ["var1", "var2", "var3", "var4"]
# Features whose tick labels should be rotated to stay readable
rotated_features = {"var1"}

# If Necessary: frequency table of each feature.
# Printed explicitly, because a bare expression in the middle of a cell shows nothing.
for feature in categorical_features:
    print(df[feature].value_counts())

# Graphic analysis: one count plot per feature on a two-column grid
n_cols = 2
n_rows = max(1, -(-len(categorical_features) // n_cols))  # ceiling division
fig, axis = plt.subplots(n_rows, n_cols, figsize=(10, 3.75 * n_rows), squeeze=False)
axes_flat = axis.ravel()

for ax, feature in zip(axes_flat, categorical_features):
    sns.countplot(ax=ax, data=df, x=feature, palette='pastel', hue=feature, legend=False)

    # Horizontal Grid, set per axes: plt.grid() would only touch the last (current) axes
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Rotation
    if feature in rotated_features:
        ax.tick_params(axis='x', rotation=45)

# Delete the axes that received no plot
for ax in axes_flat[len(categorical_features):]:
    ax.axis("off")

# Adjust the Layout
fig.tight_layout()

# Display the Plot
plt.show()

# Function for Individual Variable Analysis in a Single Plot
def graph_feature(feature, rotation=False, data=None):
    """Draw a count plot of one categorical feature with the count above each bar.

    Parameters
    ----------
    feature : str
        Name of the column to plot.
    rotation : bool, default False
        When True, rotate the x tick labels by 45 degrees.
    data : pandas.DataFrame, optional
        Frame that holds ``feature``. Defaults to the notebook-level ``df``,
        which is what the function always used before this parameter existed.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    if data is None:
        data = df  # fall back to the notebook-level DataFrame

    fig, ax = plt.subplots(figsize=(4, 4))
    sns.countplot(ax=ax, data=data, x=feature, palette='pastel', hue=feature, legend=False)

    # Horizontal Grid
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Display the Values on Top of Each Bar
    for p in ax.patches:
        height = p.get_height()
        # A bar without a finite height cannot be labelled: int(nan) would raise
        if not np.isfinite(height):
            continue
        ax.text(p.get_x() + p.get_width() / 2.,
                height + 0.5,
                f'{int(height)}',
                ha='center',
                va='bottom')

    if rotation:
        ax.tick_params(axis='x', rotation=45)

    # Adjust the Layout
    fig.tight_layout()
    # Display the Plot
    plt.show()

In [None]:
# Analysis of Numerical Variables
# Numerical features to inspect (placeholders: extend or replace as needed)
numeric_features = ["var1", "var2", "var3", "var4"]

# Set of colors (one per feature, reused cyclically if there are more features than colors)
colors = [
    "#FF6F61", "#6B5B95", "#88B04B", "#F7CAC9", "#92A8D1", "#955251", "#B565A7", "#009B77", "#DD4124", "#45B8AC",
    "#EFC050", "#1F3A93", "#FF784F", "#2E86AB", "#A4C639", "#D94F70", "#7BC8A4", "#3F51B5", "#F4A261", "#66CC99",
    "#E84A5F", "#4682B4", "#FFB347", "#6A0572", "#AB83A1", "#FFD1DC", "#5DADE2", "#A8E6CF", "#DC143C", "#228B22",
    "#FF69B4", "#4169E1", "#FF4500", "#20B2AA", "#9932CC", "#F08080", "#00CED1", "#ADFF2F", "#BA55D3", "#87CEEB",
    "#FF6347", "#40E0D0", "#C71585", "#98FB98", "#FFA07A", "#00FA9A", "#6495ED", "#FF8C00", "#90EE90", "#DB7093"
]

# Each feature takes two stacked axes: a tall histogram and a short box plot below it.
n_cols = 2
n_pairs = max(1, -(-len(numeric_features) // n_cols))  # ceiling division
fig, axis = plt.subplots(
    2 * n_pairs, n_cols,
    figsize=(10, 2.5 * n_pairs),
    gridspec_kw={'height_ratios': [6, 1] * n_pairs},
    squeeze=False,
)

# Create a multiple figure with histograms and box plots
for position, feature in enumerate(numeric_features):
    pair, col = divmod(position, n_cols)
    color = colors[position % len(colors)]
    sns.histplot(ax=axis[2 * pair, col], data=df, x=feature, bins=50, color=color).set(xlabel=None)
    sns.boxplot(ax=axis[2 * pair + 1, col], data=df, x=feature, color=color).set(xlabel=feature)

# Delete only the axes that received no plot.
# (The previous hard-coded axis[3, 1] was the var4 box plot, which got hidden.)
for position in range(len(numeric_features), n_pairs * n_cols):
    pair, col = divmod(position, n_cols)
    axis[2 * pair, col].axis("off")
    axis[2 * pair + 1, col].axis("off")

# Adjust the Layout
fig.tight_layout()

# Display the Plot
plt.show()

In [None]:
# Numerical - Numerical Analysis, Seeking Correlations Between Specific Variables in My Dataset
# Column every feature is compared against, and the features to compare (placeholders)
target_column = "target"
scatter_features = ["var1", "var2", "var3", "var4"]

# Each feature takes two stacked axes: a regression scatter plot and its 2x2 correlation heatmap.
n_cols = 2
n_pairs = max(1, -(-len(scatter_features) // n_cols))  # ceiling division
fig, axis = plt.subplots(2 * n_pairs, n_cols, figsize=(12, 3.5 * n_pairs), squeeze=False)

# Create a Multiple Scatter Plot
for position, feature in enumerate(scatter_features):
    pair, col = divmod(position, n_cols)
    is_left_column = col == 0

    scatter_ax = sns.regplot(ax=axis[2 * pair, col], data=df, x=feature, y=target_column)
    if not is_left_column:
        # The y label is only kept on the left-hand plots
        scatter_ax.set(ylabel=None)

    # The color bar is only drawn for the right-hand heatmaps
    sns.heatmap(df[[target_column, feature]].corr(), annot=True, fmt=".2f",
                ax=axis[2 * pair + 1, col], cbar=not is_left_column)

# Delete only the axes that received no plot.
# (The previous hard-coded axis[3, 1] was the var4 heatmap, which got hidden.)
for position in range(len(scatter_features), n_pairs * n_cols):
    pair, col = divmod(position, n_cols)
    axis[2 * pair, col].axis("off")
    axis[2 * pair + 1, col].axis("off")

# Adjust the Layout
fig.tight_layout()

# Display the Plot
plt.show()

In [None]:
# Categorical - Categorical Analysis
fig, axis = plt.subplots(4, 2, figsize = (10, 15))

# Features compared against the target; they fill the grid row by row
compared_features = ("var1", "var2", "var3", "var4")

for position, (ax, feature) in enumerate(zip(axis.flat, compared_features)):
    sns.countplot(ax=ax, data=df, x=feature, palette='husl', hue="target", legend=True)
    if position == 0:
        # Rotation: only the first plot gets slanted tick labels
        ax.tick_params(axis='x', rotation=45)

# ......

# Delete axis: switch off the bottom-right cell of the grid
axis[3, 1].axis("off")

# Adjust the Layout
plt.tight_layout()

# Display the Plot
plt.show()

In [None]:
# Complete Numerical - Categorical Analysis

# Convert, if necessary, categorical variables to numerical ones, in a simple way using pd.factorize()
# (the first element of the factorize() result is the array of integer codes)
df["var1_num"] = pd.factorize(df["var1"])[0]

# Convert a numeric column with erroneous values to numeric
# (errors="coerce" turns every value that cannot be parsed into NaN)
# NOTE(review): this reuses the "var1" placeholder of the factorize example above;
# in a real notebook the two recipes are presumably meant for different columns.
df["var1"] = pd.to_numeric(df["var1"], errors="coerce")
# Check the data type after the conversion
# (printed explicitly: a bare expression in the middle of a cell shows nothing)
print("var1 dtype:", df["var1"].dtype)
# Check if the conversion generated missing values
print("var1 missing values:", df["var1"].isnull().sum())

fig, axis = plt.subplots(figsize = (15, 10))

# Correlation matrix of the selected columns, drawn on the axes created above
sns.heatmap(df[["var1_num", "var2", "var3_num", "var4", "..."]].corr(), annot = True, fmt = ".2f", ax = axis)

# Adjust the Layout
fig.tight_layout()

# Display the Plot
plt.show()

In [None]:
# All-to-All Relationships
# Draw seaborn's pair plot for the whole DataFrame, then render the figure.
sns.pairplot(data = df)
plt.show()

In [None]:
# Final exploration of missing values
# Count the missing values of each column and list the columns from most to fewest.
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Cleaning outliers

# Interquartile range: limits placed 1.5 * IQR outside the first and third quartiles.
# Only the limits are computed here; no row is removed.
var1_info = df["var1"].describe()
var1_q1, var1_q3 = var1_info["25%"], var1_info["75%"]
var1_iqr = var1_q3 - var1_q1
up_limit = var1_q3 + 1.5 * var1_iqr
low_limit = var1_q1 - 1.5 * var1_iqr

In [None]:
# Resampling example
# NOTE(review): this runs before the train/test split further down, so copies of the
# same row can land in both sets; consider resampling only the training split.

# Split the dataset into subsets based on labels
label_0 = df[df['var1'] == 0]
label_1 = df[df['var1'] == 1]
label_2 = df[df['var1'] == 2]

# Determine the maximum size to balance
max_size = max(len(label_0), len(label_1), len(label_2))

# Upsample every label to max_size by drawing rows with replacement
# (existing rows are duplicated; no synthetic data is generated)
label_0_balanced = resample(label_0, replace=True, n_samples=max_size, random_state=42)
label_1_balanced = resample(label_1, replace=True, n_samples=max_size, random_state=42)
label_2_balanced = resample(label_2, replace=True, n_samples=max_size, random_state=42)

# Combining balanced datasets
df = pd.concat([label_0_balanced, label_1_balanced, label_2_balanced])

# Mix the rows to avoid any order
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the balance (same label column as above; the former 'va1' raised KeyError)
print(df['var1'].value_counts())

In [None]:
# Splitting data into train and test sets
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows; add stratify=y for imbalanced classes in the target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Save my information in two .csv files for work from app.py
# Each split is stored with its target column attached on the right
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

# Save to CSV files (the index is not written)
for split_frame, csv_path in (
    (train_data, '../data/processed/train_data.csv'),
    (test_data, '../data/processed/test_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)

In [None]:
# Vectorize data if necessary

# Function to clean text
def clean_text(texto):
    """Normalize a piece of text before vectorization.

    The text is lower-cased, digits and punctuation are removed, and every run
    of whitespace is collapsed into a single space (leading and trailing
    whitespace is stripped).

    Parameters
    ----------
    texto : str or any scalar
        Text to clean. Missing values (None, NaN, pd.NA) yield an empty string,
        so the function can be applied to a column that contains gaps; any
        other non-string scalar is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(texto, str):
        # .lower() does not exist on floats/None: treat missing cells as empty documents
        if texto is None or pd.isna(texto):
            return ""
        texto = str(texto)

    texto = texto.lower() # Lower case
    texto = re.sub(r'\d+', '', texto) # Numbers
    texto = re.sub(r'[^\w\s]', '', texto) # Punctuation marks (underscores are kept: they match \w)
    texto = re.sub(r'\s+', ' ', texto).strip() # Multiple spaces and at the beginning and end
    return texto

# Clean up the text column of each split.
# The cleaning must be applied to X_train / X_test: they were split off earlier, so
# cleaning a column of `df` at this point never reaches the text that gets vectorized.
train_text = X_train["review"].apply(clean_text)
test_text = X_test["review"].apply(clean_text)

# Vectorization with TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english")

# The vocabulary is learned from the training text only and reused for the test text
X_train_tfidf = vectorizer.fit_transform(train_text)
X_test_tfidf = vectorizer.transform(test_text)

In [None]:
# Coding
# The encoders below are alternative recipes shown on the same placeholder column.
# The original column is removed only once, after every encoder has read it.

# With Label Encoder
# Instantiate the encoder
label_encoder_var1 = LabelEncoder()
# Train the encoder with the training data
label_encoder_var1.fit(X_train['var1'])
# Apply the encoder on both
# (transform raises ValueError for a test category that was not seen during fit)
X_train['var1_le'] = label_encoder_var1.transform(X_train['var1'])
X_test['var1_le'] = label_encoder_var1.transform(X_test['var1'])

# With Ordinal Encoder
# Instantiate the encoder with the order of the categories
ordinal_encoder_var1 = OrdinalEncoder(categories=[["cat1", "cat2", "cat3"]])
# Train the encoder with the training data
# (OrdinalEncoder needs 2-D input, hence the double brackets)
ordinal_encoder_var1.fit(X_train[['var1']])
# Apply the encoder on both; the (n_samples, 1) result is flattened to fill one column
X_train['var1_oe'] = ordinal_encoder_var1.transform(X_train[['var1']]).ravel()
X_test['var1_oe'] = ordinal_encoder_var1.transform(X_test[['var1']]).ravel()

# After checking the coding, remove the uncoded feature
X_train = X_train.drop("var1", axis=1)
X_test = X_test.drop("var1", axis=1)

# Alternative with pd.factorize() or pd.get_dummies()
# NOTE(review): these two lines change `df`, not the X_train / X_test splits made
# earlier; to affect the model inputs they presumably belong before the split.
df["var1_num"] = pd.factorize(df["var1"])[0]
df = pd.get_dummies(df, columns=["var1", "var2", "var3"])


In [None]:
# Scaling
# Instantiate the scaler (or MinMaxScaler) and learn its parameters from the training split only
scaler = StandardScaler()
scaler.fit(X_train)

# Transformation of both splits with the fitted scaler
scaled_train_values = scaler.transform(X_train)
scaled_test_values = scaler.transform(X_test)

# Conversion to dataframe of scaled data, keeping the original row index and column names
X_train_scaled = pd.DataFrame(scaled_train_values, index=X_train.index, columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaled_test_values, index=X_test.index, columns=X_test.columns)

In [None]:
# Selecting the best parameters for the model
# Keep at most 5 features; k is capped so it never exceeds the number of available columns.
# Score function: mutual_info_regression here, f_regression is the imported alternative.
# (scipy's pearsonr is not a SelectKBest score function; scikit-learn offers r_regression for that.)
n_selected = min(5, X_train_scaled.shape[1])

# Train the model once: fitting a second time is redundant and, for mutual
# information (a randomized estimate), may produce different scores.
selection_model = SelectKBest(mutual_info_regression, k = n_selected).fit(X_train_scaled, y_train)
ix = selection_model.get_support()  # boolean mask of the kept columns

# Apply the model, keeping the row index so the frames stay aligned with y_train / y_test
X_train_sel = pd.DataFrame(selection_model.transform(X_train_scaled),
                           index = X_train_scaled.index,
                           columns = X_train_scaled.columns.values[ix])
X_test_sel = pd.DataFrame(selection_model.transform(X_test_scaled),
                          index = X_test_scaled.index,
                          columns = X_test_scaled.columns.values[ix])

In [None]:
# ML Models
# Training: fit() returns the fitted estimator, so creation and training are chained
model = RandomForestClassifier(random_state=42).fit(X_train_sel, y_train)

# Predict on both splits (train predictions are used to compare against test metrics)
y_pred_test = model.predict(X_test_sel)
y_pred_train = model.predict(X_train_sel)

In [None]:
# Metrics
# Classification
# f1/precision/recall default to average="binary", which raises ValueError when the
# target has more than two classes. Binary targets keep that default; multiclass
# targets use the support-weighted mean of the per-class scores instead.
metric_average = "binary" if pd.Series(y_train).nunique() <= 2 else "weighted"

accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)
f1_score_test = f1_score(y_test, y_pred_test, average=metric_average)
f1_score_train = f1_score(y_train, y_pred_train, average=metric_average)
precision_test = precision_score(y_test, y_pred_test, average=metric_average)
precision_train = precision_score(y_train, y_pred_train, average=metric_average)
recall_test = recall_score(y_test, y_pred_test, average=metric_average)
recall_train = recall_score(y_train, y_pred_train, average=metric_average)

print("Accuracy Test: ", accuracy_test)
print("F1 score Test: ", f1_score_test)
print("Precision Test: ", precision_test)
print("Recall Test: ", recall_test)

print("Accuracy Train: ", accuracy_train)
print("F1 score Train: ", f1_score_train)
print("Precision Train: ", precision_train)
print("Recall Train: ", recall_train)

# Create the confusion matrix (rows: true labels, columns: predicted labels)
cm = confusion_matrix(y_test, y_pred_test)

# Graph the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicción')
plt.ylabel('Verdadero')
plt.title('Matriz de Confusión')
plt.show()

# --------------------------------------------------

# Regression
# Test split: mean squared error, its square root, and the coefficient of determination
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)
r2_score_test = r2_score(y_test, y_pred_test)

print("MSE Test: ", mse_test)
print("R2 Score Test: ",r2_score_test)
print("RMSE Test: ", rmse_test)

# Train split: the same three metrics, to compare against the test values
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(mse_train)
r2_score_train = r2_score(y_train, y_pred_train)

print("MSE Train: ", mse_train)
print("R2 Score Train: ",r2_score_train)
print("RMSE Train: ", rmse_train)