## Getting the Feature ranking

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Load the dataset used for the feature ranking
df = pd.read_csv('C:\\Users\\rp7248\\Desktop\\with_EM.csv')

# Separate the features and the target
X = df.iloc[:, 1:-1]  # assuming the first column is an ID and the last column is the target
y = df.iloc[:, -1]

# Create a RandomForest Classifier.
# A fixed seed makes the forest (and therefore the RFE ranking derived from its
# feature importances) reproducible across kernel restarts.
model = RandomForestClassifier(random_state=42)

# Create the RFE object and rank each feature.
# Here, we are selecting the top 15 features; every selected feature gets
# rank 1 and the eliminated ones get increasing ranks.
rfe = RFE(estimator=model, n_features_to_select=15)
rfe.fit(X, y)

# Summarize the selection of the attributes
selected_features = X.columns[rfe.support_]
print('Selected features:', selected_features)

# Ranking of features, best (rank 1) first
feature_ranking = pd.DataFrame({'Feature': X.columns, 'Ranking': rfe.ranking_}).sort_values(by='Ranking')
print(feature_ranking)


## Checking whether the ranking is correct

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Read the evaluation CSV into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='C:\\Users\\rp7248\\Desktop\\Csv files\\With_Em_final2.csv',
)

# Feature columns that the evaluation loop removes one after another, in this
# exact order (presumably the order produced by the RFE ranking - TODO confirm).
features = [
    'Census_ProcessorModelIdentifier',
    'Census_OSVersion',
    'SmartScreen',
    'Census_OSBuildRevision',
    'Census_SystemVolumeTotalCapacity',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_OEMNameIdentifier',
    'GeoNameIdentifier',
    'LocaleEnglishNameIdentifier',
    'CityIdentifier',
    'CountryIdentifier',
    'Census_FirmwareVersionIdentifier',
    'AVProductStatesIdentifier',
    'AvSigVersion',
    'Census_OEMModelIdentifier',
    'OrganizationIdentifier',
    'Census_PrimaryDiskTotalCapacity',
    'AppVersion',
    'Census_OSUILocaleIdentifier',
    'Census_OSInstallTypeName',
    'Census_FirmwareManufacturerIdentifier',
    'Wdft_RegionIdentifier',
    'OsBuildLab',
    'Census_InternalPrimaryDisplayResolutionHorizontal',
    'Census_InternalBatteryNumberOfCharges',
    'Census_TotalPhysicalRAM',
    'Census_OSInstallLanguageIdentifier',
    'EngineVersion',
    'IeVerIdentifier',
    'AVProductsInstalled',
    'Census_OSEdition',
    'Census_ChassisTypeName',
    'Census_OSWUAutoUpdateOptionsName',
    'Census_InternalPrimaryDisplayResolutionVertical',
    'Census_InternalBatteryType',
    'Census_ProcessorCoreCount',
    'Census_ActivationChannel',
    'Census_OSBranch',
    'Census_OSSkuName',
    'Census_MDC2FormFactor',
    'Census_PrimaryDiskTypeName',
    'Wdft_IsGamer',
    'Census_IsSecureBootEnabled',
    'OsBuild',
    'Census_GenuineStateName',
    'Census_OSBuildNumber',
    'SkuEdition',
    'Census_PowerPlatformRoleName',
    'OsPlatformSubRelease',
    'Census_IsTouchEnabled',
    'OsSuite',
    'Census_ProcessorManufacturerIdentifier',
    'Census_ThresholdOptIn',
    'Census_IsFlightingInternal',
    'Census_IsWIMBootEnabled',
]

# One record per iteration is collected here and turned into a DataFrame
# after the loop.
results = []

# Columns excluded from the feature matrix. Starts with the first column
# (presumably a row ID - TODO confirm) and the last column (used as the target
# below), and grows by one entry of `features` per iteration, so the removal
# is cumulative: iteration k trains without the first k listed features.
features_to_drop = [df.columns[0], df.columns[-1]]

# The target does not change between iterations, so extract it once.
y = df.iloc[:, -1]

# Loop over the features array
for feature in features:
    features_to_drop.append(feature)
    # Everything that has not been dropped yet is used as model input
    X = df.drop(columns=features_to_drop)

    # Once every column has been removed there is nothing left to train on;
    # the scaler would raise on a zero-column matrix, so stop cleanly and
    # keep the results gathered so far.
    if X.shape[1] == 0:
        print(f"No features left after removing '{feature}'; stopping.")
        break

    print(f"Number of features used (iteration {feature}): {X.shape[1]}")

    # Split the data into training and testing sets (fixed seed: same split
    # every iteration, so accuracies are comparable)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Feature scaling: fit on the training split only, then apply to both
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Create and train the model
    model = LogisticRegression(max_iter=100)
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Calculate the accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Print the current status
    print(f"Feature removed: {feature}")
    print(f"Model accuracy without '{feature}': {accuracy:.4f}")

    # Save the results
    results.append({
        'Removed Feature': feature,
        'Accuracy': accuracy,
        'Loss': 'N/A'  # Placeholder for loss
    })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# Save the results to a CSV file. The directory is "Csv files", matching the
# folder the input dataset is read from; the previous "Csv_file" spelling
# pointed at a different directory than every other path in this notebook.
results_df.to_csv('C:\\Users\\rp7248\\Desktop\\Csv files\\model_results.csv', index=False)


In [None]:
# Rebuild the results table from the collected records and write it to disk
results_df = pd.DataFrame(data=results)
results_df.to_csv(
    path_or_buf='C:\\Users\\rp7248\\Desktop\\Csv files\\model_results.csv',
    index=False,
)


## Create the Final dataset

In [None]:
import pandas as pd

# Feature columns carried over into the final dataset; the first and last
# columns of the CSV are added around them further down in this cell.
columns_to_extract = [
    'Census_ProcessorModelIdentifier',
    'Census_OSVersion',
    'SmartScreen',
    'Census_OSBuildRevision',
    'Census_SystemVolumeTotalCapacity',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_OEMNameIdentifier',
    'GeoNameIdentifier',
    'LocaleEnglishNameIdentifier',
    'CityIdentifier',
    'CountryIdentifier',
    'Census_FirmwareVersionIdentifier',
    'AVProductStatesIdentifier',
    'AvSigVersion',
    'Census_OEMModelIdentifier',
    'OrganizationIdentifier',
    'Census_PrimaryDiskTotalCapacity',
    'AppVersion',
    'Census_OSUILocaleIdentifier',
    'Census_OSInstallTypeName',
    'Census_FirmwareManufacturerIdentifier',
    'Wdft_RegionIdentifier',
    'OsBuildLab',
    'Census_InternalPrimaryDisplayResolutionHorizontal',
    'Census_InternalBatteryNumberOfCharges',
]

# Location of the source CSV
csv_file_path = 'C:\\Users\\rp7248\\Desktop\\Csv files\\With_Em_final2.csv'

# Load the full table
df = pd.read_csv(csv_file_path)

# The file's first and last columns are kept on either side of the chosen features
first_column, last_column = df.columns[0], df.columns[-1]
columns_to_keep = [first_column, *columns_to_extract, last_column]

# Subset the frame to exactly those columns, in that order
new_df = df[columns_to_keep]

# NOTE(review): this rebinds `results_df`, which earlier cells use for the
# model-accuracy table; kept as-is so later code sees the same names.
results_df = pd.DataFrame(new_df)

# Write the reduced dataset to disk
results_df.to_csv(
    'C:\\Users\\rp7248\\Desktop\\Csv files\\final_dataset.csv',
    index=False,
)

# Preview the first few rows of the reduced frame
new_df.head()
