# Lab 9. Visualización Interactiva

## Miembros
- Fernanda Esquivel 21542
- Sofía Lam 21548

## Link al repositorio
El repositorio puede ser visualizado [acá](https://github.com/SofiLam13/Lab9-DS-)

## Link al documento
El documento puede ser visualizado [acá](https://github.com/SofiLam13/Lab9-DS-)


# Visualización con Streamlit

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from streamlit_jupyter import StreamlitPatcher, jupyter_server

# Patch Streamlit so that its calls can be used from inside the notebook.
StreamlitPatcher().patch()

# Read the training data. The path below is a placeholder and has to point at
# the actual train.csv before the cell is run.
data_path = 'path/to/your/train.csv'
train_df = pd.read_csv(data_path)

# Sidebar widgets that drive the filtered table.
st.sidebar.header('User Input Features')
keyword_options = train_df['keyword'].unique()
keyword_filter = st.sidebar.multiselect('Select keyword(s)', keyword_options)
target_filter = st.sidebar.selectbox('Select disaster classification', [0, 1])

# With no keyword chosen every row is kept; otherwise only rows whose keyword
# is among the selected ones survive.
if keyword_filter:
    filtered_df = train_df.loc[train_df['keyword'].isin(keyword_filter)]
else:
    filtered_df = train_df
# Then narrow the result to the selected target class.
filtered_df = filtered_df.loc[filtered_df['target'] == target_filter]

# Show the heading together with the filtered rows.
st.write("### Filtered Data", filtered_df)

# Model Training
# The feature is the raw tweet text; the label is the `target` column.
X = train_df['text'].values
y = train_df['target'].values

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# scikit-learn estimators need numeric features, so the text is converted to
# TF-IDF vectors first. The vocabulary is learned from the training split only
# and then reused on the test split, so the held-out tweets do not influence it.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Logistic Regression Model
logreg = LogisticRegression()
logreg.fit(X_train_vec, y_train)
logreg_pred = logreg.predict(X_test_vec)

# Decision Tree Model
# Seeded so that rerunning the script yields the same tree and the same metrics.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train_vec, y_train)
tree_pred = tree.predict(X_test_vec)

# Random Forest Model
# Seeded for the same reason as the decision tree.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_vec, y_train)
rf_pred = rf.predict(X_test_vec)

# Model Comparison Table
# Maps each display name to that model's predictions on the held-out split.
models = {
    'Logistic Regression': logreg_pred,
    'Decision Tree': tree_pred,
    'Random Forest': rf_pred
}

# The user picks which of the models above to report on.
model_selection = st.sidebar.multiselect('Select Models to Compare', list(models.keys()))

# Display comparison results
if model_selection:
    for model in model_selection:
        predictions = models[model]
        st.write(f"### {model}")
        st.write(confusion_matrix(y_test, predictions))
        # classification_report returns a whitespace-aligned plain-text table.
        # st.write would treat that string as Markdown and collapse the
        # alignment, so it is shown as preformatted text instead.
        st.text(classification_report(y_test, predictions))

# Violin Plot: Tweet Length vs Classification
# Character count of each tweet, stored as a new column for the plots below.
train_df['text_length'] = train_df['text'].str.len()

# Each chart is drawn on its own Figure/Axes pair and that Figure is handed to
# st.pyplot, rather than passing the pyplot module and relying on its global
# "current figure". The figure is closed afterwards so figures do not pile up
# in pyplot's global state.
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(x='target', y='text_length', data=train_df, palette=['#E7ECEF', '#274C77'], ax=ax)
st.pyplot(fig)
plt.close(fig)

# Scatter Plot for Text Length vs Disaster/Non-Disaster
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='text_length', y='target', data=train_df, hue='target', palette=['#6096BA', '#A3CEF1'], ax=ax)
st.pyplot(fig)
plt.close(fig)

# Heatmap for keyword frequency
# Counts of the 20 most frequent keywords, drawn as a single-row heatmap.
keyword_counts = train_df['keyword'].value_counts().head(20)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(keyword_counts.to_frame().T, annot=True, cmap='Blues', ax=ax)
st.pyplot(fig)
plt.close(fig)

# Start the Streamlit server in Jupyter
# Calls the `jupyter_server` name imported from streamlit_jupyter at the top of
# the cell.
# NOTE(review): confirm that the installed streamlit_jupyter version actually
# exports `jupyter_server`; if it does not, the import itself will fail.
jupyter_server()
