In [27]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [28]:
# Read the celebrity table from the mounted Google Drive folder and preview it.
# Keeping the location in one named constant makes it easy to repoint the notebook.
CSV_PATH = "/content/drive/MyDrive/CSV files/Celebbs.csv"
df = pd.read_csv(CSV_PATH)
df.head(2)

Unnamed: 0,Name,Profession,Gender,Age,Birth_City,Birth_State,Nationality,Status
0,Tom Hanks,Actor,Male,68,Concord,California,American,Alive
1,Oprah Winfrey,TV Host/Producer,Female,71,Kosciusko,Mississippi,American,Alive


In [29]:
# Report how many fully duplicated rows exist, then drop them (first occurrence is kept).
# The count is printed explicitly: a bare expression is only rendered by the notebook
# when it is the last statement of a cell, so without print() it would be discarded.
n_duplicates = int(df.duplicated().sum())
print(f"Duplicate rows found: {n_duplicates}")
df = df.drop_duplicates()

In [30]:
# Separate features (X) and target (Y).
# X is every column after the first; .copy() gives X its own data so the column
# assignments and drops performed on X in later cells never act on a slice of df
# (which would raise SettingWithCopyWarning on pandas versions without copy-on-write).
X = df.iloc[:, 1:].copy()

# Y is the first column (the celebrity name) as a 1-D array, the shape sklearn expects.
Y = df.iloc[:, 0].to_numpy()

In [31]:
# Count occurrences of each profession in the dataset.
# This Series (taken BEFORE bucketing) is also passed to the prediction function.
profession_counts = X['Profession'].value_counts()

# Merge professions with 13 or fewer occurrences into 'Other' to keep the number of
# categories small. The frequency lookup is vectorised: each row's profession is mapped
# to its count, and rows whose count is not above 13 are replaced. A missing profession
# maps to NaN (value_counts ignores NaN), fails the comparison, and therefore also
# becomes 'Other' instead of raising a KeyError.
row_frequency = X['Profession'].map(profession_counts)
X = X.assign(Profession=X['Profession'].where(row_frequency > 13, 'Other'))

# Display updated profession counts (for verification)
X['Profession'].value_counts()

Unnamed: 0_level_0,count
Profession,Unnamed: 1_level_1
Other,166
Actor,114
Actress,97
Singer,47
Entrepreneur,25
Musician,22
Politician,18
Cricketer,16


In [32]:
# Replace the raw age with five 0/1 bucket indicators and discard the columns the
# model does not use (birth city/state and, once bucketed, the age itself).
#
# NOTE: every comparison is strict, so an age of exactly 30, 50, 70 or 90 gets a 0 in
# all five indicators. get_user_input_and_predict encodes the player's answer with the
# same strict comparisons; change both places together if the boundaries are revised.
age = X['Age']
age_flags = {
    'under_30': age.lt(30).astype(int),                       # age < 30
    'bet_30 & 50': (age.gt(30) & age.lt(50)).astype(int),     # 30 < age < 50
    'bet_50 & 70': (age.gt(50) & age.lt(70)).astype(int),     # 50 < age < 70
    'bet_70 & 90': (age.gt(70) & age.lt(90)).astype(int),     # 70 < age < 90
    'over_90': age.gt(90).astype(int),                        # age > 90
}
X = X.drop(columns=['Birth_City', 'Birth_State', 'Age']).assign(**age_flags)

In [33]:
# One-hot encode the remaining categorical columns, then cast the whole frame to int
# so every feature is a 0/1 integer.
categorical_columns = ['Profession', 'Nationality', 'Gender', 'Status']
X = pd.get_dummies(X, columns=categorical_columns)
X = X.astype(int)
X.head(3)

Unnamed: 0,under_30,bet_30 & 50,bet_50 & 70,bet_70 & 90,over_90,Profession_Actor,Profession_Actress,Profession_Cricketer,Profession_Entrepreneur,Profession_Musician,Profession_Other,Profession_Politician,Profession_Singer,Nationality_American,Nationality_Indian,Gender_Female,Gender_Male,Status_Alive,Status_Dead
0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0
1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0
2,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0


In [34]:
# Train a decision tree on the encoded features, using entropy as the split criterion.
# random_state is fixed because scikit-learn permutes the candidate features at every
# split; without a seed, equally good splits can be chosen differently from run to run,
# so the fitted tree would not be reproducible after Restart & Run All.
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
clf.fit(X, Y)  # X: preprocessed 0/1 features, Y: celebrity names

In [35]:
# Function to collect user input and predict a celebrity
def get_user_input_and_predict(clf, X_columns, profession_counts):
    """Ask the player about a celebrity and print the model's guess.

    The answers are encoded into a single-row DataFrame with exactly the columns in
    ``X_columns`` (all 0 except the indicators selected by the answers) and passed to
    ``clf.predict``.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict``.
    X_columns : iterable of str
        Feature column names, in the order used for training.
    profession_counts : pandas.Series or mapping
        Profession -> number of occurrences. Professions with a count above 13 are
        offered as explicit choices; anything else is encoded as 'Other'.

    Returns
    -------
    The first predicted label (it is also printed).
    """
    print("\n=== Guess Who! ===")
    print("Answer the following questions about the celebrity you're thinking of.")

    # Start from an all-zero feature row; only keys present here reach the model.
    user_input = {col: 0 for col in X_columns}

    # Age: keep asking until a whole number is entered instead of crashing on bad input.
    while True:
        raw_age = input("How old is your celebrity? (Enter age in years): ").strip()
        try:
            age = int(raw_age)
            break
        except ValueError:
            print(f"'{raw_age}' is not a whole number; please try again.")

    # The comparisons are strict on purpose: they mirror how the training features were
    # built, so ages of exactly 30, 50, 70 or 90 set no bucket here either.
    user_input['under_30'] = 1 if age < 30 else 0
    user_input['bet_30 & 50'] = 1 if 30 < age < 50 else 0
    user_input['bet_50 & 70'] = 1 if 50 < age < 70 else 0
    user_input['bet_70 & 90'] = 1 if 70 < age < 90 else 0
    # The training column is 'over_90'; writing any other key would be silently dropped
    # when the row is built with columns=X_columns.
    user_input['over_90'] = 1 if age > 90 else 0

    # Profession: match case-insensitively against the common professions and only set
    # a column that really exists; everything else falls back to 'Other'.
    common_professions = [p for p, c in profession_counts.items() if c > 13]
    print("Common professions in the dataset:", common_professions)
    profession = input("What is your celebrity's profession? (e.g., Actor, Singer, or 'Other'): ").strip()
    profession_lookup = {str(p).casefold(): p for p in common_professions}
    matched_profession = profession_lookup.get(profession.casefold())
    profession_col = f'Profession_{matched_profession}'
    if matched_profession is not None and profession_col in user_input:
        user_input[profession_col] = 1
    else:
        user_input['Profession_Other'] = 1

    # Nationality: options come from the training columns. Strip only the prefix so
    # names containing '_' survive, and compare case-insensitively so multi-word
    # nationalities are not broken by re-capitalising the answer.
    nationality_prefix = 'Nationality_'
    nationalities = [col[len(nationality_prefix):] for col in X_columns
                     if col.startswith(nationality_prefix)]
    print("Possible nationalities:", nationalities)
    nationality = input("What is your celebrity's nationality? (e.g., American, Indian): ").strip()
    nationality_lookup = {n.casefold(): n for n in nationalities}
    matched_nationality = nationality_lookup.get(nationality.casefold())
    if matched_nationality is not None:
        user_input[nationality_prefix + matched_nationality] = 1
    else:
        print(f"Nationality '{nationality}' not in dataset; defaulting to no specific nationality.")

    # Gender
    gender = input("Is your celebrity male or female? (Enter 'Male' or 'Female'): ").strip().capitalize()
    if gender in ['Male', 'Female']:
        user_input[f'Gender_{gender}'] = 1
    else:
        print("Invalid gender input; defaulting to no specific gender.")

    # Status (alive or dead)
    status = input("Is your celebrity alive or dead? (Enter 'Alive' or 'Dead'): ").strip().capitalize()
    if status in ['Alive', 'Dead']:
        user_input[f'Status_{status}'] = 1
    else:
        print("Invalid status input; defaulting to no specific status.")

    # Build a one-row frame with the training column order; keys not in X_columns are ignored.
    user_df = pd.DataFrame([user_input], columns=X_columns)

    # Make a prediction using the trained model
    prediction = clf.predict(user_df)
    guess = prediction[0]
    print(f"\nMy prediction: {guess}")
    return guess

# Let's test it!

### First, I am thinking of Amitabh Bachchan

### Second, let's try Shri Narendra Modi

In [36]:
# Play one round, then keep offering another round until the player answers
# anything other than 'yes'.
keep_playing = True
while keep_playing:
    get_user_input_and_predict(clf, X.columns, profession_counts)
    answer = input("\nWould you like to play again? (yes/no): ").strip().lower()
    keep_playing = (answer == 'yes')

print("Thanks for playing!")


=== Guess Who! ===
Answer the following questions about the celebrity you're thinking of.
How old is your celebrity? (Enter age in years): 82
Common professions in the dataset: ['Actor', 'Actress', 'Singer', 'Entrepreneur', 'Musician', 'Politician', 'Cricketer']
What is your celebrity's profession? (e.g., Actor, Singer, or 'Other'): Actor
Possible nationalities: ['American', 'Indian']
What is your celebrity's nationality? (e.g., American, Indian): Indian
Is your celebrity male or female? (Enter 'Male' or 'Female'): Male
Is your celebrity alive or dead? (Enter 'Alive' or 'Dead'): Alive

My prediction: Amitabh Bachchan

Would you like to play again? (yes/no): yes

=== Guess Who! ===
Answer the following questions about the celebrity you're thinking of.
How old is your celebrity? (Enter age in years): 74
Common professions in the dataset: ['Actor', 'Actress', 'Singer', 'Entrepreneur', 'Musician', 'Politician', 'Cricketer']
What is your celebrity's profession? (e.g., Actor, Singer, or 'Ot

# Wow! Great, it's guessing right.