In [None]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources the text-cleaning step relies on (stop-word list,
# WordNet and its omw-1.4 companion data). quiet=True keeps the downloader's
# progress log and the trailing `True` return value out of the cell output.
for nltk_package in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(nltk_package, quiet=True)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Location of the raw support-ticket export (under /content, the Colab
# working directory).
DATA_PATH = "/content/customer_support_tickets.csv"

# Read the whole CSV into memory; every later cell works on this frame.
df = pd.read_csv(DATA_PATH)

# Quick sanity check of what was loaded: dimensions, then column names.
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")


Dataset shape: (8469, 17)
Columns: ['Ticket ID', 'Customer Name', 'Customer Email', 'Customer Age', 'Customer Gender', 'Product Purchased', 'Date of Purchase', 'Ticket Type', 'Ticket Subject', 'Ticket Description', 'Ticket Status', 'Resolution', 'Ticket Priority', 'Ticket Channel', 'First Response Time', 'Time to Resolution', 'Customer Satisfaction Rating']


In [None]:
# Column names that carry "id" as its own word ("Ticket ID", "customer_id",
# "id") or as a camelCase suffix ("CustomerID", "userId"). A plain substring
# test would also hit unrelated names such as "Paid", "Width" or "Valid".
# All-caps run-together names ("ORDERID") are not matched by name; genuine
# identifiers of that kind are still caught by the uniqueness rule below.
_ID_WORD_RE = re.compile(r"(?:^|[^A-Za-z0-9])id(?:$|[^A-Za-z0-9])", re.IGNORECASE)
_ID_CAMEL_SUFFIX_RE = re.compile(r"[a-z0-9]I[Dd]$")


def _looks_like_identifier(column_name):
    """Return True when the column name itself marks the column as an identifier."""
    name = str(column_name)
    return bool(_ID_WORD_RE.search(name) or _ID_CAMEL_SUFFIX_RE.search(name))


# "One distinct value per row" only signals an identifier when there is more
# than one row; with 0 or 1 rows every column would trivially satisfy it.
# NOTE(review): this rule also removes any fully-unique free-text or
# continuous numeric column - confirm that is intended for other datasets.
n_rows = len(df)
id_like_columns = [
    col for col in df.columns
    if _looks_like_identifier(col)
    or (n_rows > 1 and df[col].nunique() == n_rows)
]

# Rebind instead of mutating in place so the cell stays safe to re-run.
df = df.drop(columns=id_like_columns, errors="ignore")


In [None]:
# Object-dtype columns are the only candidates for free text and
# categorical data; this list is reused when looking for a target column.
object_cols = df.select_dtypes(include="object").columns.tolist()

if len(object_cols) == 0:
    raise ValueError("No text/categorical columns found.")


def _mean_string_length(column_name):
    """Average character count of a column's values after casting them to str."""
    as_text = df[column_name].astype(str)
    return as_text.str.len().mean()


# The free-text field is taken to be the column whose entries are longest on
# average; on a tie the earliest column wins.
TEXT_COLUMN = max(object_cols, key=_mean_string_length)

print(f"Detected text column: {TEXT_COLUMN}")


Detected text column: Ticket Description


In [None]:
# A usable classification target needs at least two classes; a column with
# more than MAX_TARGET_CLASSES distinct values is treated as too fine-grained
# to serve as a label.
MIN_TARGET_CLASSES = 2
MAX_TARGET_CLASSES = 20

candidate_targets = [
    col for col in object_cols
    if col != TEXT_COLUMN
    and MIN_TARGET_CLASSES <= df[col].nunique() <= MAX_TARGET_CLASSES
]

if candidate_targets:
    # Heuristic: of all qualifying columns, the last one in column order is used.
    TARGET_COLUMN = candidate_targets[-1]
    print("Detected target column:", TARGET_COLUMN)
else:
    # No label available: downstream cells fall back to fitting on all rows.
    TARGET_COLUMN = None
    print("No target column detected → Unsupervised mode")


Detected target column: Ticket Channel


In [None]:
# Built once: set membership keeps the per-token stop-word test cheap.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Matches every character that is neither a lowercase ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r"[^a-z\s]")


def clean_text(text):
    """Normalise one raw text value for vectorisation.

    Steps: lowercase, replace every non-letter character with a space,
    split on whitespace, drop English stop words and lemmatize what is left.

    Parameters
    ----------
    text : object
        A single cell value. Missing values (as judged by ``pd.isna``) yield
        an empty string; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    if pd.isna(text):
        return ""
    # str() guards against numeric or other non-string cells, which have no .lower().
    text = str(text).lower()
    # Substitute a space rather than deleting: removing punctuation outright
    # glues neighbouring words together ("crashed.Please" -> "crashedplease")
    # and turns "don't" into "dont", which the stop-word list does not contain.
    text = _NON_LETTER_RE.sub(" ", text)
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return " ".join(tokens)


# Overwrite the text column with its cleaned form for the TF-IDF step.
df[TEXT_COLUMN] = df[TEXT_COLUMN].apply(clean_text)


In [None]:
# Separate the label from the features when a target column was detected;
# otherwise keep an independent copy of the full frame and leave y unset.
has_target = bool(TARGET_COLUMN)

y = df[TARGET_COLUMN] if has_target else None
X = df.drop(columns=[TARGET_COLUMN]) if has_target else df.copy()


In [None]:
# "number" selects a wider set of numeric dtypes (e.g. int32, float32) than
# the int64/float64 pair. A column that lands in neither list is not handed
# to any transformer in the ColumnTransformer, so it would vanish from the
# feature matrix without warning.
numeric_features = X.select_dtypes(include="number").columns.tolist()

# Every object-dtype column except the free-text one is treated as categorical;
# the text column gets its own TF-IDF branch instead of one-hot encoding.
categorical_features = [
    col for col in X.select_dtypes(include=["object"]).columns
    if col != TEXT_COLUMN
]


In [None]:
# Numeric branch: fill missing values with the column median, then
# standardise each column.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)


In [None]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" lets transform() accept
# categories that were not present when the encoder was fitted.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)


In [None]:
# Text branch: TF-IDF over unigrams and bigrams, ignoring terms that occur in
# fewer than two documents and capping the vocabulary at 5000 entries.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
)
text_pipeline = Pipeline(steps=[("tfidf", tfidf_vectorizer)])


In [None]:
# One (name, transformer, columns) entry per feature family.
column_transformers = [
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features),
    # The text column is given as a bare name, not a list, so the vectorizer
    # receives a 1-D sequence of strings rather than a 2-D frame.
    ("txt", text_pipeline, TEXT_COLUMN),
]

preprocessor = ColumnTransformer(transformers=column_transformers)


In [None]:
if y is None:
    # No label: fit the preprocessing on every row at once.
    X_processed = preprocessor.fit_transform(X)
    print(f"Processed data shape: {X_processed.shape}")

else:
    # Hold out 20% of the rows, keeping class proportions equal in both parts;
    # the fixed seed makes the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    # Fit on the training rows only, then apply the fitted transformers to the
    # held-out rows so nothing is learned from the test set.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    print(f"Train shape: {X_train_processed.shape}")
    print(f"Test shape: {X_test_processed.shape}")


Train shape: (6775, 27786)
Test shape: (1694, 27786)


To upload your code to GitHub, you'll generally follow these steps within your Colab environment:

1.  **Configure Git**: Set up your user name and email.
2.  **Initialize a Git repository**: This turns your Colab working directory into a Git repository.
3.  **Add remote origin**: Link your local repository to your GitHub repository.
4.  **Stage and commit files**: Select the files you want to upload and commit them to your local repository.
5.  **Push to GitHub**: Transfer your committed changes to the remote GitHub repository.

**Important**: For authentication, you will need a GitHub Personal Access Token (PAT). Create one from your GitHub settings (Developer settings -> Personal access tokens -> Tokens (classic)). Give it `repo` scope. **Do not paste your PAT directly into the notebook.** Instead, save it as a secret in Colab (left panel, '🔑' icon) named `GITHUB_TOKEN`.

In [None]:
import os

# Set the Git author identity recorded on commits made from this runtime.
# --global stores it in the runtime user's Git configuration rather than in
# a single repository.
!git config --global user.name "RusticHalo012"
!git config --global user.email "sayansarkar5515@gmail.com"

# NOTE(review): the name and e-mail above are hard-coded; change both values if someone else commits from this notebook.

Next, initialize a Git repository in your current working directory. If you're working within `/content/`, this will be the root for your repository. Then, link it to your GitHub repository.

In [None]:
# Initialize a new Git repository
!git init

# Add a remote origin to your GitHub repository
# Replace <YOUR_GITHUB_USERNAME> and <YOUR_REPO_NAME> with your actual GitHub username and repository name
!git remote add origin https://github.com/RusticHalo012/Infosys_Springboard_Internship_Sayan_Sarkar_batch11.git

# Verify the remote was added
!git remote -v

Reinitialized existing Git repository in /content/.git/
error: remote origin already exists.
origin	https://github.com/RusticHalo012/Infosys_Springboard_Internship_Sayan_Sarkar_batch11.git (fetch)
origin	https://github.com/RusticHalo012/Infosys_Springboard_Internship_Sayan_Sarkar_batch11.git (push)


Now, add your files to the staging area and commit them. `git add .` stages every file in the current directory; you can list individual files or directories instead if you prefer.

In [None]:
# Stage everything under the current directory.
# NOTE(review): the dataset is read from /content/customer_support_tickets.csv
# and contains customer names and e-mail addresses; if this repository lives
# in /content, `git add .` will stage that file too unless it is ignored -
# confirm it should be published before pushing.
!git add .

# Record the staged files as a commit in the local repository
!git commit -m "Initial commit from Google Colab"

# Show the current branch and any uncommitted changes
!git status

On branch master
nothing to commit, working tree clean
On branch master
nothing to commit, working tree clean


Finally, push your committed changes to your GitHub repository. You will use your Personal Access Token for authentication. It's securely retrieved from Colab's secrets manager.

In [None]:
import subprocess

from google.colab import userdata

# Retrieve the GitHub Personal Access Token (PAT) from Colab's secrets manager
# (left panel, '🔑' icon). The secret must be named `GITHUB_TOKEN` and the
# token needs the 'repo' scope.
GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
if not GITHUB_TOKEN:
    raise RuntimeError(
        "The GITHUB_TOKEN secret is empty; add a GitHub PAT with 'repo' scope "
        "to Colab's secrets manager before pushing."
    )

# Remote URL with the token embedded for HTTPS authentication. It is only
# passed to git for this one push and never stored in the repository config.
remote_url = f"https://{GITHUB_TOKEN}@github.com/RusticHalo012/Infosys_Springboard_Internship_Sayan_Sarkar_batch11.git"

# Push the local branch, which is 'master' (not 'main') in this runtime.
# Running git through subprocess instead of `!git push` gives access to the
# exit status and lets the output be scrubbed before it is displayed.
push_result = subprocess.run(
    ["git", "push", remote_url, "master"],
    capture_output=True,
    text=True,
)

# git may echo the remote URL in its messages; mask the token so it cannot end
# up in the saved notebook output.
push_log = (push_result.stdout + push_result.stderr).replace(GITHUB_TOKEN, "***")
if push_log.strip():
    print(push_log.strip())

# Only report success when git actually succeeded.
if push_result.returncode != 0:
    raise RuntimeError(f"git push failed with exit code {push_result.returncode}")

print("Code successfully pushed to GitHub!")

Everything up-to-date
Code successfully pushed to GitHub!
