In [None]:
# Install Streamlit for the apps written further down.
# %pip (rather than !pip) installs into the environment of the running kernel,
# whereas !pip uses whichever pip the subshell finds first on PATH.
%pip install -q streamlit


In [None]:
# Install the localtunnel npm package; later cells invoke it via `npx localtunnel`.
!npm install localtunnel


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

st.title("Task 1: Clustering")

train_data = pd.read_excel("/content/train.xlsx")
target_variable = train_data.pop("target")

# Select up to 5000 random rows to significantly reduce the training time.
# The cap at len(train_data) keeps the app working on smaller workbooks:
# DataFrame.sample raises when n exceeds the row count (sampling without replacement).
sample_size = min(5000, len(train_data))
selected_data = train_data.sample(n=sample_size, random_state=42)
selected_target = target_variable.loc[selected_data.index]  # Align with selected_data's index

# Only the 80% training part is used below; the held-out 20% is not needed in this app.
X_train, _, y_train, _ = train_test_split(selected_data, selected_target, test_size=0.2, random_state=42)


st.write("A note on how I approached the problem:  I tried using other algortihms for this problem including K-means, but SVC was showing better silhouette score. So I chose the SVC model and further finetuned the hyperparameters using Grid Search. I have not shown the implementation of either Grid Search or Kmeans in here because it would make the code extremely long and hard to read and also Grid Search takes a huge amount of time to come up with the best parameters.")

# Preprocessing the numerical features
scaler = StandardScaler()
scaled_train_data = scaler.fit_transform(X_train)

# SVC model (replace with best hyperparameters from grid search)
svc = SVC(C=0.1, gamma=0.1, kernel='rbf')
svc.fit(scaled_train_data, y_train)

# Predicting cluster labels
train_cluster_labels = svc.predict(scaled_train_data)

# Distinct predicted labels plus, for every sample, the position of its label in
# that list. The positions are used as numeric colour codes for the plot.
unique_labels, numeric_labels = np.unique(train_cluster_labels, return_inverse=True)

# Calculating silhouette score (optional).
# silhouette_score is only defined for 2 <= n_labels <= n_samples - 1, so skip it
# instead of crashing the whole app when the model predicts a single label.
if 1 < len(unique_labels) < len(train_cluster_labels):
    silhouette_score_train = silhouette_score(scaled_train_data, train_cluster_labels)
    print(f"Silhouette score (train): {silhouette_score_train:.4f}")
else:
    print("Silhouette score (train): undefined (needs at least two predicted labels)")

# A new data point (for testing purpose)
new_data_point = pd.DataFrame({"T1":-70,"T2":-59,"T3":-67,"T4":-58,"T5":-91,"T6":-99,"T7":-76,"T8":-54,"T9":-93,"T10":-72,"T11":-83,"T12":-54,"T13":-77,"T14":-65,"T15":-82,"T16":-88,"T17":-54,"T18":-76}, index=[0])

st.write("New data point added after having trained the model")
st.dataframe(new_data_point)

# Preprocessing with the scaler fitted on the training data
scaled_new_data_point = scaler.transform(new_data_point)

# Using the best model to predict the cluster label
predicted_cluster = svc.predict(scaled_new_data_point)

print(f"Predicted cluster for the new data point: {predicted_cluster[0]}")

st.write(f"Predicted cluster for the new data point: {predicted_cluster[0]}")

# Visualizing the cluster with the new data point.
# TSNE offers no transform() for unseen samples, so the new point is appended to
# the training matrix and embedded together with it; its coordinates are the
# last row of the joint embedding (embedding_[0] would be a training sample).
tsne = TSNE(n_components=2, random_state=42)  # Set random_state for reproducibility
joint_embedding = tsne.fit_transform(np.vstack([scaled_train_data, scaled_new_data_point]))
reduced_train_data = joint_embedding[:-1]
reduced_new_data_point = joint_embedding[-1]

# Visualization
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(reduced_train_data[:, 0], reduced_train_data[:, 1], c=numeric_labels, cmap='viridis')
ax.scatter(reduced_new_data_point[0], reduced_new_data_point[1], c='red', marker='x', label='New Data Point')
ax.set_title("Cluster with New Data Point")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")
# Only the new point carries a legend entry; clusters are distinguished by colour.
ax.legend(loc='upper right')
fig.tight_layout()
st.write("New data point plotted against previous data points to show the user visually which cluster the new data point belongs to. Each colour here is a different cluster. Since the number of clusters or labels were immense I used TSNE to reduce or scale it down for the purpose of clear visualization.")
st.pyplot(fig)
# Streamlit re-executes this script on every rerun; close the figure so that
# pyplot does not keep accumulating open figures.
plt.close(fig)


Overwriting app.py


In [None]:
# Launch the Streamlit app in the background (trailing `&`) so the cell returns
# right away; the server's output is redirected to /content/logs.txt.
!streamlit run /content/app.py &>/content/logs.txt &

In [None]:
# Open a localtunnel to local port 8501; the public URL is printed in the output.
# NOTE(review): assumes the Streamlit server is listening on 8501 - confirm in /content/logs.txt.
!npx localtunnel --port 8501

[K[?25hnpx: installed 22 in 1.416s
your url is: https://many-frogs-try.loca.lt
^C


In [None]:
%%writefile app.py

import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

st.title("Task 2: Clasification")

data = pd.read_excel("/content/train.xlsx")
X = data.drop("target", axis=1)  # Features
y = data["target"]  # Target variable
st.write("Algorithm Chosen: Random Forest")

# Hold out 20% of the labelled rows to estimate accuracy on data the model has
# not seen. The seed is fixed because Streamlit re-executes the script on every
# rerun; without it the reported accuracy and the predictions would change.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the Random Forest model (seeded for the same reason as the split)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Making predictions on the hold-out rows
y_pred = model.predict(X_holdout)

# Evaluating accuracy. This is measured on the hold-out split, not on the rows
# the model was trained on, so it is labelled accordingly.
accuracy = accuracy_score(y_holdout, y_pred)
print(f"Hold-out accuracy: {accuracy:.4f}")
st.write(f"Hold-out accuracy: {accuracy:.4f}")
st.write("I tried using two Classifiers, SVM and Random Forest. The reason for that was Random Forest offers interpretability and handles potential imbalances well, while SVM excels at finding clear boundaries between classes. I tried both and Random Forest had the highest training accuracy on this dataset, that is why I decided to choose this algorithm.")

test_data = pd.read_excel("/content/test.xlsx")

# Feed the model exactly the training feature columns, in training order.
# A separate frame is used so that adding the prediction column to test_data
# below cannot leak into the model input.
test_features = test_data[X.columns]

# Generating predictions with Random Forest
predicted_targets = model.predict(test_features)

test_data["predicted_target"] = predicted_targets


print(test_data["predicted_target"])
st.write("Predicted target values on the test dataset:")
st.dataframe(test_data["predicted_target"])

Overwriting app.py


In [None]:
# Launch the (rewritten) Streamlit app in the background, logging to /content/logs.txt.
# NOTE(review): no visible cell stops the server started by the earlier run cell;
# confirm which process and port end up serving this version of app.py.
!streamlit run /content/app.py &>/content/logs.txt &

In [None]:
# Open a localtunnel to local port 8501; the public URL is printed in the output.
# NOTE(review): assumes the Streamlit server is listening on 8501 - confirm in /content/logs.txt.
!npx localtunnel --port 8501

[K[?25hnpx: installed 22 in 1.956s
your url is: https://hungry-readers-hear.loca.lt
^C


In [None]:
# Exporting the target values to an Excel file so that they can be easily shared.
#
# `test_data` is only created inside app.py, and a %%writefile cell saves its
# content without executing it. On a fresh kernel the name therefore does not
# exist, so the predictions are rebuilt here with the same steps as the Task 2
# app (80/20 split, Random Forest) instead of relying on leftover kernel state.
if "test_data" not in globals():
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    labelled = pd.read_excel("/content/train.xlsx")
    features = labelled.drop("target", axis=1)
    # Seeded so that re-running this cell writes the same file every time.
    features_train, _, target_train, _ = train_test_split(
        features, labelled["target"], test_size=0.2, random_state=42
    )
    export_model = RandomForestClassifier(random_state=42).fit(features_train, target_train)

    test_data = pd.read_excel("/content/test.xlsx")
    test_data["predicted_target"] = export_model.predict(test_data[features.columns])

test_data["predicted_target"].to_excel("target_values.xlsx", index=False)

In [None]:
%%writefile app.py

import streamlit as st
import pandas as pd
st.title("Task 3(2): Datewise summation of picking and placing activity done.")

df = pd.read_excel("/content/rawdata.xlsx")


def _daily_total(activity):
    """Return the per-date sum of 'number' for rows whose 'activity' equals `activity`."""
    return df.loc[df['activity'] == activity].groupby('date')['number'].sum()


def _plain_number(value):
    """Unwrap a NumPy scalar into a built-in int/float; other values pass through."""
    return value.item() if hasattr(value, "item") else value


# Summing picking and placing activities by date
picking_counts = _daily_total('picked')
placing_counts = _daily_total('placed')

# A date may have only one of the two activities. Reindexing both series onto
# the union of dates with fill_value=0 shows such days as 0 rather than NaN and
# keeps the original dtype of the sums (NaN would force a float column).
all_dates = picking_counts.index.union(placing_counts.index)
results = pd.DataFrame({
    'Picking': picking_counts.reindex(all_dates, fill_value=0),
    'Placing': placing_counts.reindex(all_dates, fill_value=0),
})
print(results)


st.header("Datewise Picking/Placing Activity")

# Displaying the DataFrame directly with st.dataframe
st.dataframe(results)

# Displaying the overall totals with st.metric. The sums are converted to
# built-in numbers first, so st.metric does not have to handle NumPy scalars.
st.metric(label="Total Picking Activities", value=_plain_number(results['Picking'].sum()))
st.metric(label="Total Placing Activities", value=_plain_number(results['Placing'].sum()))

Overwriting app.py


In [None]:
# Launch the (rewritten) Streamlit app in the background, logging to /content/logs.txt.
# NOTE(review): no visible cell stops the servers started by the earlier run cells;
# confirm which process and port end up serving this version of app.py.
!streamlit run /content/app.py &>/content/logs.txt &

In [None]:
# Open a localtunnel to local port 8501; the public URL is printed in the output.
# NOTE(review): assumes the Streamlit server is listening on 8501 - confirm in /content/logs.txt.
!npx localtunnel --port 8501


[K[?25hnpx: installed 22 in 1.606s
your url is: https://giant-baths-think.loca.lt
^C
