<a href="https://colab.research.google.com/github/VipulPhatangare/Nurathon/blob/main/Nurathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install tensorflow

# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from google.colab import drive


# Load data
# Both CSV files live on Google Drive, so the drive has to be mounted before
# the /content/drive paths below exist in the Colab runtime.
drive.mount("/content/drive")
file_path = "/content/drive/MyDrive/111/train.csv"
df = pd.read_csv(file_path)
df_test = pd.read_csv("/content/drive/MyDrive/111/test.csv")

# Enhanced date processing with cyclical encoding
def convert_mixed_dates(date_str):
    """Parse a date string written in one of two known layouts.

    Strings containing "-" are read as day-month-year ("%d-%m-%Y"); otherwise
    strings containing "/" are read as month/day/year ("%m/%d/%Y").

    Args:
        date_str: The raw date value, normally a string.

    Returns:
        The parsed ``pd.Timestamp``, or ``pd.NaT`` when the value contains
        neither separator, does not match the expected layout, or is not a
        string at all.
    """
    try:
        if "-" in date_str:
            return pd.to_datetime(date_str, format="%d-%m-%Y")
        if "/" in date_str:
            return pd.to_datetime(date_str, format="%m/%d/%Y")
    except (TypeError, ValueError):
        # TypeError: the value is not a string (e.g. None or a float NaN).
        # ValueError: the text does not match the expected layout.
        # Anything else (KeyboardInterrupt, MemoryError, ...) must propagate.
        return pd.NaT
    return pd.NaT

# Replace the raw timestamp column with numeric calendar features on both the
# training frame and the test frame.
for dataset in (df, df_test):
    stamps = dataset["Time_Logged"].astype(str).apply(convert_mixed_dates)
    dataset["Time_Logged"] = stamps

    # Plain calendar parts first (Day and Month are only temporary helpers).
    for part in ("Day", "Month", "Year"):
        dataset[part] = getattr(stamps.dt, part.lower())

    # Cyclical encoding: project day (period 31) and month (period 12) onto
    # sine/cosine components.
    for part, period in (("Day", 31), ("Month", 12)):
        angle = 2 * np.pi * dataset[part] / period
        dataset[f"{part}_sin"] = np.sin(angle)
        dataset[f"{part}_cos"] = np.cos(angle)

    # Only Year and the sin/cos columns are kept.
    dataset.drop(columns=["Time_Logged", "Day", "Month"], inplace=True)

# Encode target
# Fit the label encoder on the training target, then overwrite the column
# with the integer class ids; `la` is kept to decode predictions later.
la = LabelEncoder()
la.fit(df['Product_Category'])
df['Product_Category'] = la.transform(df['Product_Category'])

# Handle missing values
# Each listed column is filled with its own most frequent value, computed
# separately for the training frame and the test frame.
categorical_cols = ["Region", "Location_Code"]
for dataset in (df, df_test):
    for col in categorical_cols:
        modes = dataset[col].mode()
        if modes.empty:
            # Column holds no non-missing value, so there is nothing to fill with.
            continue
        # Assign the result back instead of calling fillna(inplace=True) on the
        # column selection: that chained form acts on a temporary object and
        # stops updating the frame in pandas 3.0.
        dataset[col] = dataset[col].fillna(modes[0])

# Encode categorical features
# The encoder learns its categories from the training frame only; values that
# appear only in the test frame are mapped to -1.
categorical_columns = [
    "Organization",
    "Region",
    "Location_Code",
    "Channel",
    "Org_Reply",
    "Timely_Flag",
]
ordinal_encoder = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')
ordinal_encoder.fit(df[categorical_columns])
for frame in (df, df_test):
    frame[categorical_columns] = ordinal_encoder.transform(frame[categorical_columns])

# Enhanced TF-IDF Vectorization with n-grams
# Unigrams and bigrams, English stop words removed, vocabulary capped at
# 10000 terms. The vocabulary is learned from the training feedback only.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
)
train_tfidf = vectorizer.fit_transform(df['Customer_Feedback'])
test_tfidf = vectorizer.transform(df_test['Customer_Feedback'])

# Convert the sparse results to dense arrays.
X_text = train_tfidf.toarray()
X_test_text = test_tfidf.toarray()

# Process structured data
# Every column except the free-text feedback and the target is treated as a
# structured feature and standardised with statistics from the training frame.
# NOTE(review): if df carries an identifier column such as 'ID', it is
# included as a feature here too; confirm that is intended.
non_feature_cols = {'Customer_Feedback', 'Product_Category'}
structured_cols = [col for col in df.columns if col not in non_feature_cols]
scaler = StandardScaler()
scaler.fit(df[structured_cols])
X_structured = scaler.transform(df[structured_cols])
X_test_structured = scaler.transform(df_test[structured_cols])

# Hybrid neural network architecture
# Two input branches (TF-IDF text features and scaled structured features)
# are processed separately, concatenated, and classified with a softmax head.
n_text_features = X_text.shape[1]
n_struct_features = X_structured.shape[1]
n_classes = len(la.classes_)

# Text branch: Dense(512) -> BatchNorm -> Dropout(0.5)
text_input = Input(shape=(n_text_features,), name='text_input')
text_dense = Dense(units=512, activation='relu')(text_input)
text_bn = BatchNormalization()(text_dense)
text_dropout = Dropout(rate=0.5)(text_bn)

# Structured data branch: Dense(256) -> BatchNorm -> Dropout(0.3)
struct_input = Input(shape=(n_struct_features,), name='struct_input')
struct_dense = Dense(units=256, activation='relu')(struct_input)
struct_bn = BatchNormalization()(struct_dense)
struct_dropout = Dropout(rate=0.3)(struct_bn)

# Concatenate branches, then an L2-regularised hidden layer and the output
# layer with one unit per encoded target class.
combined = Concatenate()([text_dropout, struct_dropout])
dense2 = Dense(units=128, activation='relu', kernel_regularizer='l2')(combined)
output = Dense(units=n_classes, activation='softmax')(dense2)

model = Model(inputs=[text_input, struct_input], outputs=output)
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    # Targets are integer class ids, hence the sparse loss.
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Class weights for imbalance
# 'balanced' weights are computed over the encoded training target and keyed
# by their position in the sorted unique class array.
target_classes = np.unique(df['Product_Category'])
class_weights = compute_class_weight(
    'balanced',
    classes=target_classes,
    y=df['Product_Category'],
)
class_weight_dict = {idx: weight for idx, weight in enumerate(class_weights)}

# Early stopping
# Stop once val_loss has not improved for 5 epochs and restore the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Stratified train-validation split
# 80/20 split of the text features, structured features and target, all
# shuffled with the same indices. stratify= keeps each class's share the same
# in both partitions; without it the split is purely random and rare classes
# can be under-represented (or missing) in the validation set.
X_train_text, X_val_text, X_train_struct, X_val_struct, y_train, y_val = train_test_split(
    X_text, X_structured, df['Product_Category'],
    test_size=0.2, random_state=42,
    stratify=df['Product_Category'],
)

# Train model
# Both inputs are passed as a list in the order the model declares them
# (text first, structured second). Training runs for at most 50 epochs and
# may end sooner through the early-stopping callback.
train_inputs = [X_train_text, X_train_struct]
val_inputs = [X_val_text, X_val_struct]
history = model.fit(
    train_inputs,
    y_train,
    validation_data=(val_inputs, y_val),
    epochs=50,
    batch_size=128,
    class_weight=class_weight_dict,
    callbacks=[early_stop],
    verbose=1,
)

# Evaluate
# evaluate() yields the loss followed by the compiled metric (accuracy).
val_metrics = model.evaluate([X_val_text, X_val_struct], y_val)
val_loss, val_acc = val_metrics
print(f"\nValidation Accuracy: {val_acc:.4f}")

# Predict and save
# Take the most probable class per test row, decode it to the original label,
# and write the ID/prediction pairs to the submission CSV.
test_predictions = model.predict([X_test_text, X_test_structured])
predicted_class_ids = np.argmax(test_predictions, axis=1)
df_test['Predicted_Product_Category'] = la.inverse_transform(predicted_class_ids)

submission = df_test[['ID', 'Predicted_Product_Category']]
submission.to_csv("optimized_submission.csv", index=False)

print("Submission file ready!")



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[col].fillna(dataset[col].mode()[0], inplace=True)


Epoch 1/50
250/250 ━━━━━━━━━━━━━━━━━━━━ 38s 142ms/step - accuracy: 0.1157 - loss: 4.7988 - val_accuracy: 0.2789 - val_loss: 4.4957
Epoch 2/50
250/250 ━━━━━━━━━━━━━━━━━━━━ 33s 130ms/step - accuracy: 0.5585 - loss: 3.4514 - val_accuracy: 0.6049 - val_loss: 3.4932
Epoch 3/50
250/250 ━━━━━━━━━━━━━━━━━━━━ 40s 127ms/step - accuracy: 0.7041 - loss: 2.6025 - val_accuracy: 0.7180 - val_loss: 2.6310
Epoch 4/50
250/250 ━━━━━━━━━━━━━━━━━━━━ 33s 133ms/step - accuracy: 0.7665 - loss: 2.1421 - val_accuracy: 0.7458 - val_loss: 2.3140
Epoch 5/50
250/250 ━━━━━━━━━━━━━━━━━━━━ 40s 128ms/step - accuracy: 0.8078 - loss: 1.8353 - val_accuracy: 0.7582 - val_loss: 2.0964
Epoch 6/50
250/250 ━━━━━━━━━━━━━━━━━━━━ 41s 127ms/step - accuracy: 0.8307 - loss: 1.5952 - val_accuracy: 0.7678 - val_loss: 1.9039
Epoch 7/50