<a href="https://colab.research.google.com/github/Salmanbhatti123/Al-ml-task2/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:

from datasets import load_dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch
import gradio as gr


dataset = load_dataset("ag_news")
print("Available columns:", dataset["train"].column_names)


text_column = "text"
if text_column not in dataset["train"].column_names:
    raise KeyError(f"Expected 'text' column not found. Found columns: {dataset['train'].column_names}")


tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    return tokenizer(example[text_column], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns([text_column])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")


model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average='weighted')
    }


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    save_total_limit=2
)

# === 9. Trainer Setup ===
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].shuffle(seed=42).select(range(10000)),  # Reduce for speed
    eval_dataset=tokenized_datasets["test"].select(range(2000)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


trainer.train()

eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)


id2label = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Science/Technology"
}

def classify_news(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    return {id2label[i]: float(probs[0][i]) for i in range(4)}

gr.Interface(fn=classify_news, inputs="text", outputs="label", title="News Topic Classifier").launch()


Available columns: ['text', 'label']


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [13]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib

url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(url)


df = df.drop(columns=['customerID'])
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df.dropna()


X = df.drop("Churn", axis=1)
y = df["Churn"].map({"Yes": 1, "No": 0})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns


numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])


clf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])


param_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [None, 10, 20]
}

grid_search = GridSearchCV(clf_pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

y_pred = grid_search.predict(X_test)
print("Best Params:", grid_search.best_params_)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


joblib.dump(grid_search.best_estimator_, "churn_prediction_pipeline.joblib")
print("✅ Model saved as churn_prediction_pipeline.joblib")


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params: {'classifier__max_depth': 10, 'classifier__n_estimators': 100}

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.91      0.87      1033
           1       0.66      0.49      0.56       374

    accuracy                           0.80      1407
   macro avg       0.74      0.70      0.71      1407
weighted avg       0.78      0.80      0.79      1407

Confusion Matrix:
 [[938  95]
 [192 182]]
✅ Model saved as churn_prediction_pipeline.joblib


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import pandas as pd
import numpy as np
from PIL import Image, ImageDraw
import os


os.makedirs("sample_images", exist_ok=True)
for i in range(1, 4):
    img = Image.new('RGB', (128, 128), color=(i*50, 100, 150))
    d = ImageDraw.Draw(img)
    d.text((10, 60), f"Img {i}", fill=(255, 255, 0))
    img.save(f"sample_images/img{i}.jpg")


data = {
    'image_path': ['sample_images/img1.jpg', 'sample_images/img2.jpg', 'sample_images/img3.jpg'],
    'weight': [0.5, 0.8, 0.6],
    'price': [10.0, 15.0, 12.5],
    'label': [0, 1, 0]
}
df = pd.DataFrame(data)


class MultimodalDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.df = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image = Image.open(row['image_path']).convert("RGB")
        if self.transform:
            image = self.transform(image)
        tabular = torch.tensor([row['weight'], row['price']], dtype=torch.float)
        label = torch.tensor(row['label'], dtype=torch.long)
        return image, tabular, label


transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])


dataset = MultimodalDataset(df, transform=transform)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)


class MultimodalClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        self.cnn.fc = nn.Identity()
        self.image_fc = nn.Linear(512, 128)
        self.tabular_fc = nn.Sequential(
            nn.Linear(2, 32),
            nn.ReLU(),
            nn.Linear(32, 16)
        )
        self.classifier = nn.Sequential(
            nn.Linear(128 + 16, 64),
            nn.ReLU(),
            nn.Linear(64, 2)
        )

    def forward(self, image, tabular):
        img_feat = self.image_fc(self.cnn(image))
        tab_feat = self.tabular_fc(tabular)
        combined = torch.cat((img_feat, tab_feat), dim=1)
        return self.classifier(combined)


model = MultimodalClassifier()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


model.train()
for epoch in range(3):
    for images, tabular, labels in dataloader:
        outputs = model(images, tabular)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")


Epoch 1: Loss = 0.3110
Epoch 2: Loss = 1.1049
Epoch 3: Loss = 0.4060
