In [39]:
import pandas as pd

# Sample DataFrame
data = {
    'age':    [24, 26, 27, 29, 38, 40, 47, 51, 62 ],
    'salary': [50000, 65000, 72000, 80000, 150000, 175000, 250000, 300000, 380000],
    'years_at_company_amazon': [1, 2, 4, 5, 7, 9, 10, 12, 13 ]
}

df = pd.DataFrame(data)

# Creating a new feature: Experience Level
df['experience_level'] = pd.cut(df['years_at_company_amazon'], bins=[0, 4, 8, 15], labels=['Junior', 'Mid', 'Senior'])

# Creating interaction feature: Age * Salary
df['age_salary_interaction'] = df['age'] * df['salary']

# Display the modified DataFrame
print(df)


   age  salary  years_at_company_amazon experience_level  \
0   24   50000                        1           Junior   
1   26   65000                        2           Junior   
2   27   72000                        4           Junior   
3   29   80000                        5              Mid   
4   38  150000                        7              Mid   
5   40  175000                        9           Senior   
6   47  250000                       10           Senior   
7   51  300000                       12           Senior   
8   62  380000                       13           Senior   

   age_salary_interaction  
0                 1200000  
1                 1690000  
2                 1944000  
3                 2320000  
4                 5700000  
5                 7000000  
6                11750000  
7                15300000  
8                23560000  


In [13]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardizing the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['age', 'salary', 'years_at_company']])

# Applying PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_features)

# Convert to DataFrame
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
print(pca_df)


        PC1       PC2
0  2.453217 -0.238571
1  1.369587  0.120639
2 -0.324333  0.308694
3 -1.203129  0.075348
4 -2.295341 -0.266110


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Generating a sample dataset
X, y = make_classification(n_samples=100, n_features=5, random_state=42)

# Training a Random Forest model
model = RandomForestClassifier()
model.fit(X, y)

# Extracting feature importances
importances = model.feature_importances_
feature_names = [f'Feature {i}' for i in range(X.shape[1])]

# Displaying feature importances
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print(importance_df)


     Feature  Importance
3  Feature 3    0.525202
1  Feature 1    0.312829
0  Feature 0    0.083347
2  Feature 2    0.041214
4  Feature 4    0.037408


In [15]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Initializing a Logistic Regression model
model = LogisticRegression()

# Running RFE to select the top 3 features
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, y)

# Selecting the features
selected_features = [feature_names[i] for i in range(len(feature_names)) if fit.support_[i]]
print(f"Selected Features: {selected_features}")


Selected Features: ['Feature 0', 'Feature 1', 'Feature 3']
