In [1]:
#Task 1
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

np.random.seed(123)
n = 120

area = np.random.normal(1600, 250, n).astype(int)
rooms = np.random.randint(2, 6, n)
toilets = np.random.randint(1, 4, n)
property_age = np.random.randint(1, 40, n)
zone = np.random.choice(['X', 'Y', 'Z'], n)

cost = (
    area * 180 +
    rooms * 12000 +
    toilets * 14000 -
    property_age * 900 +
    np.where(zone == 'Y', 30000, 0) +
    np.where(zone == 'Z', 50000, 0) +
    np.random.normal(0, 25000, n)
).astype(int)

housing_df = pd.DataFrame({
    'area': area,
    'rooms': rooms,
    'toilets': toilets,
    'property_age': property_age,
    'zone': zone,
    'cost': cost
})

df_encoded = pd.get_dummies(housing_df, columns=['zone'], drop_first=True)
X = df_encoded.drop('cost', axis=1)
y = df_encoded['cost']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)
model = LinearRegression()
model.fit(X_train, y_train)

predicted = model.predict(X_test)
print(f"RMSE: Rs. {np.sqrt(mean_squared_error(y_test, predicted)):.2f}")
print(f"R²: {r2_score(y_test, predicted):.3f}")

house_input = pd.DataFrame([{
    'area': 1750,
    'rooms': 4,
    'toilets': 2,
    'property_age': 5,
    'zone_Y': 0,
    'zone_Z': 1
}])

estimated_cost = model.predict(house_input)[0]
print(f"Estimated House Cost: Rs. {estimated_cost:,.2f}")


RMSE: Rs. 26409.12
R²: 0.775
Estimated House Cost: Rs. 434,842.98


In [4]:
#Task 2
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import hstack

emails_df = pd.DataFrame({
    'content': [
        "Don't miss your chance to win a car now!",
        "Team sync at 4pm today",
        "Click this link to claim your bonus http://claim.com",
        "Let's catch up next week",
        "Cheap vacations await! http://scamholidays.com",
        "Update your credentials here http://hackersite.com",
        "Want to go for a walk?",
        "Win $5,000 instantly! http://cashgrab.net"
    ],
    'email_from': [
        "offer@dealz.com", "manager@corp.com", "promo@winnings.org",
        "colleague@office.com", "agent@travelspree.com",
        "alert@fraudulent.org", "buddy@mail.com", "jackpot@cashlottery.net"
    ],
    'is_spam': [1, 0, 1, 0, 1, 1, 0, 1]
})

emails_df['msg_len'] = emails_df['content'].apply(len)
emails_df['contains_link'] = emails_df['content'].str.contains("http", case=False).astype(int)
emails_df['sender_domain'] = emails_df['email_from'].apply(lambda x: x.split('@')[1])
emails_df = pd.get_dummies(emails_df, columns=['sender_domain'], drop_first=True)

tfidf = TfidfVectorizer(stop_words='english')
text_vectors = tfidf.fit_transform(emails_df['content'])

features = emails_df.drop(['content', 'email_from', 'is_spam'], axis=1)
X_full = hstack([text_vectors, features.astype(np.float64)])
y_full = emails_df['is_spam']

X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.3, random_state=123)
rf_model = RandomForestClassifier(n_estimators=100, random_state=123)
rf_model.fit(X_train, y_train)

y_preds = rf_model.predict(X_test)
print("📊 Classification Report:")
print(classification_report(y_test, y_preds))
print(f"✅ Accuracy: {accuracy_score(y_test, y_preds):.2f}")

def check_spam(email_body, sender_email):
    length = len(email_body)
    link_flag = int("http" in email_body.lower())
    domain = sender_email.split("@")[1]

    body_vector = tfidf.transform([email_body])
    sender_columns = [col for col in emails_df.columns if col.startswith("sender_domain_")]

    sender_vector = pd.DataFrame([{
        f"sender_domain_{domain}": 1 if f"sender_domain_{domain}" in sender_columns else 0
    }], columns=sender_columns).fillna(0)

    meta = pd.DataFrame([{
        'msg_len': length,
        'contains_link': link_flag
    }])

    full_input = hstack([body_vector, pd.concat([meta, sender_vector], axis=1).astype(np.float64)])
    prediction = rf_model.predict(full_input)[0]
    return "Spam" if prediction == 1 else "Not Spam"

sample_email = "Let's meet at the cafe around 6?"
sample_sender = "pal@friendsmail.com"
print(f"📧 Email classified as: {check_spam(sample_email, sample_sender)}")


📊 Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3

✅ Accuracy: 0.33
📧 Email classified as: Spam


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

np.random.seed(42)
n_customers = 300

spending = np.random.normal(50000, 15000, n_customers).clip(10000, 100000)
age = np.random.normal(40, 12, n_customers).clip(18, 80)
visits = np.random.poisson(10, n_customers)
frequency = np.random.uniform(0.2, 1.5, n_customers)

labels = (spending > 60000) & (frequency > 0.8)
labels = labels.astype(int)

df = pd.DataFrame({
    'spending_6_months': spending,
    'age': age,
    'visits': visits,
    'purchase_freq': frequency,
    'is_high_value': labels
})

for col in ['spending_6_months', 'age']:
    df.loc[df.sample(frac=0.05).index, col] = np.nan

print("First 5 rows of the dataset:")
print(df.head())

imputer = SimpleImputer(strategy="mean")
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

df_imputed['spending_6_months'] = df_imputed['spending_6_months'].clip(10000, 100000)
df_imputed['age'] = df_imputed['age'].clip(18, 80)

features = ['spending_6_months', 'age', 'visits', 'purchase_freq']
scaler = StandardScaler()
df_imputed[features] = scaler.fit_transform(df_imputed[features])

X = df_imputed[features]
y = df_imputed['is_high_value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

svm = SVC(kernel='linear')
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)

print("\n Classification Report:")
print(classification_report(y_test, y_pred))
print(f" Accuracy: {accuracy_score(y_test, y_pred):.2f}")

print("\n Feature Importance (SVM Coefficients):")
for feature, weight in zip(features, svm.coef_[0]):
    print(f"{feature}: {weight:.4f}")

print("\n Inferred Rules for Classification:")
for feature, weight in zip(features, svm.coef_[0]):
    direction = "↑" if weight > 0 else "↓"
    impact = "more likely HIGH-VALUE" if weight > 0 else "more likely LOW-VALUE"
    print(f"{direction} {feature} → {impact}")


First 5 rows of the dataset:
   spending_6_months        age  visits  purchase_freq  is_high_value
0       57450.712295  30.052060       7       0.972819              0
1       47926.035482  33.277828      12       0.845797              0
2       59715.328072  48.967523      12       1.484121              0
3       72845.447846  47.324443       7       0.377372              0
4       46487.699379        NaN       7       1.103688              0

 Classification Report:
              precision    recall  f1-score   support

         0.0       0.93      0.96      0.94        77
         1.0       0.70      0.54      0.61        13

    accuracy                           0.90        90
   macro avg       0.81      0.75      0.78        90
weighted avg       0.89      0.90      0.89        90

 Accuracy: 0.90

 Feature Importance (SVM Coefficients):
spending_6_months: 1.7775
age: 0.0190
visits: -0.3381
purchase_freq: 1.3529

 Inferred Rules for Classification:
↑ spending_6_months → more li