In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

In [3]:
# Load the CSV files
real_profiles = pd.read_csv('real_profiles.csv')  # Load real profiles
fake_profiles = pd.read_csv('fake_profiles.csv')  # Load fake profiles

# Add a label column to distinguish real and fake profiles
real_profiles['is_fake'] = 0  # 0 for real profiles
fake_profiles['is_fake'] = 1  # 1 for fake profiles

# Combine the two datasets into one
data = pd.concat([real_profiles, fake_profiles], ignore_index=True)

# Display the first few rows of the combined dataset
data.head()

Unnamed: 0,id,name,screen_name,fav_number,statuses_count,followers_count,friends_count,favourites_count,listed_count,created_at,...,profile_background_image_url,profile_background_color,profile_link_color,utc_offset,protected,verified,description,updated,dataset,is_fake
0,3610511,Davide Dellacasa,braddd,0,20370,5470,2385,145,52,Fri Apr 06 10:58:22 +0000 2007,...,http://a0.twimg.com/profile_background_images/...,BADFCD,FF0000,3600.0,,,Founder of http://www.screenweek.it & http://w...,2/14/2015 10:54,E13,0
1,5656162,Simone Economo,eKoeS,68,3131,506,381,9,40,Mon Apr 30 15:08:42 +0000 2007,...,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,3600.0,,,BSc degree (cum laude) in Computer Engineering...,2/14/2015 10:54,E13,0
2,5682702,tacone,tacone_,7696,4024,264,87,323,16,Tue May 01 11:53:40 +0000 2007,...,http://a0.twimg.com/profile_background_images/...,1A1B1F,2FC2EF,3600.0,,,Cogito ergo bestemmio.,2/14/2015 10:54,E13,0
3,6067292,alesaura,alesstar,202,40586,640,622,1118,32,Tue May 15 16:55:16 +0000 2007,...,http://a0.twimg.com/images/themes/theme4/bg.gif,0099B9,0099B9,3600.0,,,"Se la vita ti dà sarde, scapocciale!",2/14/2015 10:54,E13,0
4,6015122,Angelo,PerDiletto,37318,2016,62,64,13,0,Sun May 13 19:52:00 +0000 2007,...,http://a0.twimg.com/images/themes/theme18/bg.gif,ACDED6,38543,3600.0,,,Je me souviens,2/14/2015 10:54,E13,0


In [4]:
# Select relevant features
features = data[['fav_number', 'statuses_count', 'followers_count', 'friends_count', 
                 'favourites_count', 'listed_count', 'utc_offset', 'protected', 'verified']]
labels = data['is_fake']

# Display the first few rows of the selected features
features.head()

Unnamed: 0,fav_number,statuses_count,followers_count,friends_count,favourites_count,listed_count,utc_offset,protected,verified
0,0,20370,5470,2385,145,52,3600.0,,
1,68,3131,506,381,9,40,3600.0,,
2,7696,4024,264,87,323,16,3600.0,,
3,202,40586,640,622,1118,32,3600.0,,
4,37318,2016,62,64,13,0,3600.0,,


In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)

Training features shape: (2254, 9)
Testing features shape: (564, 9)


In [6]:
# Train the model
model = RandomForestClassifier()  # Use a Random Forest model
model.fit(X_train, y_train)  # Train the model on the training data

print("Model trained successfully!")

Model trained successfully!


In [7]:
# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 99.47%


In [8]:
# Save the model to a file
joblib.dump(model, 'fake_profile_detector.pkl')
print("Model saved as 'fake_profile_detector.pkl'")

Model saved as 'fake_profile_detector.pkl'


In [14]:
# Load the saved model
model = joblib.load('fake_profile_detector.pkl')

# New profile data (replace with actual values)
new_profile = pd.DataFrame({
    'fav_number': [7],
    'statuses_count': [100],
    'followers_count': [50],
    'friends_count': [200],
    'favourites_count': [10],
    'listed_count': [2],
    'utc_offset': [-360],
    'protected': [0],
    'verified': [0]
})

# Make a prediction
prediction = model.predict(new_profile)
print("Fake" if prediction[0] == 1 else "Real")

Real


In [12]:
# Example test profiles
test_profiles = [
    {
        'fav_number': 42,
        'statuses_count': 500,
        'followers_count': 1000,
        'friends_count': 300,
        'favourites_count': 200,
        'listed_count': 10,
        'utc_offset': -360,
        'protected': 0,
        'verified': 1
    },
    {
        'fav_number': 7,
        'statuses_count': 100,
        'followers_count': 200,
        'friends_count': 150,
        'favourites_count': 50,
        'listed_count': 5,
        'utc_offset': 0,
        'protected': 1,
        'verified': 0
    },
    {
        'fav_number': 13,
        'statuses_count': 1000,
        'followers_count': 5000,
        'friends_count': 1000,
        'favourites_count': 1000,
        'listed_count': 50,
        'utc_offset': 180,
        'protected': 0,
        'verified': 1
    },
    {
        'fav_number': 0,
        'statuses_count': 10,
        'followers_count': 5,
        'friends_count': 99999,
        'favourites_count': 0,
        'listed_count': 0,
        'utc_offset': 0,
        'protected': 0,
        'verified': 0
    },
    {
    'fav_number': 0,
    'statuses_count': 10,
    'followers_count': 5,
    'friends_count': 2,
    'favourites_count': 0,
    'listed_count': 0,
    'utc_offset': 0,
    'protected': 0,
    'verified': 0
}
]

# Convert to DataFrame
test_profiles_df = pd.DataFrame(test_profiles)

# Load the saved model
model = joblib.load('fake_profile_detector.pkl')

# Make predictions
predictions = model.predict(test_profiles_df)

# Display the predictions
for i, prediction in enumerate(predictions):
    print(f"Profile {i+1}: {'Fake' if prediction == 1 else 'Real'}")

Profile 1: Real
Profile 2: Real
Profile 3: Real
Profile 4: Real
Profile 5: Real
