<a href="https://colab.research.google.com/github/Shikshith05/ImpactEcho/blob/main/impactechomodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

def generate_impact_data(num_records=10000, fake_rate=0.1):
    """
    Generate synthetic NGO impact verification data for ImpactEcho AI.

    Args:
        num_records (int): Number of total records to generate.
        fake_rate (float): Fraction of fake or manipulated records.

    Returns:
        pandas.DataFrame: Generated dataset with authenticity labels.
    """

    num_fake = int(num_records * fake_rate)
    num_real = num_records - num_fake

    print(f"Generating {num_records:,} NGO impact records ({num_real:,} real, {num_fake:,} fake)")

    ngo_ids = [f"NGO-{i:03d}" for i in range(1, 51)]
    project_types = ["Tree Plantation", "Education Drive", "Health Camp", "Clean Water", "Women Empowerment", "Food Distribution"]
    locations = ["Delhi", "Bangalore", "Hyderabad", "Mumbai", "Kolkata", "Chennai", "Pune"]

    real_data, fake_data = [], []
    base_time = datetime(2025, 9, 1, 10, 0, 0)

    # ✅ Generate real records
    for _ in range(num_real):
        ngo_id = random.choice(ngo_ids)
        project = random.choice(project_types)
        location = random.choice(locations)
        timestamp = base_time + timedelta(days=random.randint(0, 60))

        beneficiaries_reported = random.randint(80, 500)
        verified_beneficiaries = beneficiaries_reported - random.randint(0, 10)
        impact_score = round(random.uniform(0.8, 1.0), 2)
        funding_amount = random.randint(50000, 300000)

        media_link = f"https://impactecho.org/media/{ngo_id}_{project.replace(' ', '_')}.jpg"

        real_data.append([
            timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            ngo_id,
            project,
            location,
            media_link,
            beneficiaries_reported,
            verified_beneficiaries,
            funding_amount,
            impact_score,
            0  # is_fake
        ])

    # Generate fake records
    for _ in range(num_fake):
        ngo_id = random.choice(ngo_ids + [f"FAKE-{i:03d}" for i in range(51, 61)])
        project = random.choice(project_types + ["Ghost Project", "Phantom Relief"])
        location = random.choice(locations + ["Unknown", "Remote Village", "N/A"])
        timestamp = base_time + timedelta(days=random.randint(0, 60))

        # Fake or inconsistent metrics
        beneficiaries_reported = random.randint(100, 1000)
        verified_beneficiaries = max(0, beneficiaries_reported - random.randint(100, 900))
        impact_score = round(random.uniform(0.0, 0.4), 2)
        funding_amount = random.randint(100000, 400000)

        # Broken or reused media
        media_link = random.choice([
            f"https://impactecho.org/media/fake_{random.randint(1,999)}.jpg",
            "https://imgur.com/fakeproof123",
            "https://drive.google.com/brokenlink"
        ])

        fake_data.append([
            timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            ngo_id,
            project,
            location,
            media_link,
            beneficiaries_reported,
            verified_beneficiaries,
            funding_amount,
            impact_score,
            1  # is_fake
        ])

    # Combine + shuffle
    df = pd.DataFrame(real_data + fake_data, columns=[
        "timestamp", "ngo_id", "project_type", "location", "media_link",
        "reported_beneficiaries", "verified_beneficiaries", "funding_amount",
        "impact_score", "is_fake"
    ])

    df = df.sample(frac=1).reset_index(drop=True)

    print(f"\n Data generation complete!")
    print(f"Total records: {len(df):,}")
    print(f"Fake records: {df['is_fake'].sum():,}")
    print(f"Fake rate: {df['is_fake'].mean() * 100:.2f}%")

    return df


def save_to_json(df, filename="impact_data"):
    df.to_json(f"{filename}.json", orient="records", indent=2)
    df.to_json(f"{filename}.jsonl", orient="records", lines=True)
    print(f" Saved to {filename}.json and {filename}.jsonl")


def display_sample(df, n=5):
    print(f"\nShowing {n} random samples:\n")
    print(df.sample(n).to_string(index=False))
    print("\n Summary:\n", df.describe())


if __name__ == "_main_":
    impact_df = generate_impact_data(num_records=5000, fake_rate=0.1)
    display_sample(impact_df)
    save_to_json(impact_df, "impact_data_full")

    # Split for training/testing AI model
    train_df = impact_df.sample(frac=0.8, random_state=42)
    test_df = impact_df.drop(train_df.index)

    save_to_json(train_df, "impact_data_train")
    save_to_json(test_df, "impact_data_test")

    print("\nTraining records:", len(train_df))
    print("Testing records:", len(test_df))
    print("\nImpactEcho synthetic dataset created successfully!")

In [2]:
impact_df = generate_impact_data(num_records=5000, fake_rate=0.1)
display_sample(impact_df)

Generating 5,000 NGO impact records (4,500 real, 500 fake)

 Data generation complete!
Total records: 5,000
Fake records: 500
Fake rate: 10.00%

Showing 5 random samples:

          timestamp   ngo_id      project_type  location                                               media_link  reported_beneficiaries  verified_beneficiaries  funding_amount  impact_score  is_fake
2025-09-20 10:00:00 FAKE-058 Food Distribution    Mumbai                           https://imgur.com/fakeproof123                     702                     368          254065          0.16        1
2025-10-16 10:00:00  NGO-002   Tree Plantation Hyderabad https://impactecho.org/media/NGO-002_Tree_Plantation.jpg                     111                     106          274595          0.93        0
2025-09-04 10:00:00  NGO-007   Education Drive     Delhi https://impactecho.org/media/NGO-007_Education_Drive.jpg                     500                     498          128658          0.94        0
2025-10-15 10:00:00  NGO

In [4]:
from sklearn.preprocessing import LabelEncoder

# Copy dataframe to avoid modifying original
df = impact_df.copy()

# 1️⃣ Derived feature
df['beneficiary_gap'] = df['reported_beneficiaries'] - df['verified_beneficiaries']

# 2️⃣ Features & labels
features = [
    'reported_beneficiaries',
    'verified_beneficiaries',
    'beneficiary_gap',
    'funding_amount',
    'impact_score',
    'ngo_id',
    'project_type',
    'location'
]
X = df[features]
y = df['is_fake']

# 3️⃣ Encode categorical columns
categorical_cols = ['ngo_id', 'project_type', 'location']
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le

# Check preprocessed data
print("Preprocessed feature sample:")
print(X.head())


Preprocessed feature sample:
   reported_beneficiaries  verified_beneficiaries  beneficiary_gap  \
0                     434                     427                7   
1                     138                     131                7   
2                     926                     410              516   
3                     211                     203                8   
4                     307                     306                1   

   funding_amount  impact_score  ngo_id  project_type  location  
0          200986          0.85      25             7         2  
1          275632          0.86      59             4         5  
2          212751          0.40       5             1         1  
3          280288          0.98      42             2         5  
4          120971          0.81      39             0         4  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])


In [5]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#  Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3️Build simple NN
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Output: probability of being fake
])

# Compile model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train model
history = model.fit(X_train_scaled, y_train,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.2)

# Evaluate
test_loss, test_acc = model.evaluate(X_test_scaled, y_test)
print("\nTest Accuracy:", test_acc)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.7207 - loss: 0.5434 - val_accuracy: 0.9962 - val_loss: 0.1163
Epoch 2/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9980 - loss: 0.0693 - val_accuracy: 0.9987 - val_loss: 0.0217
Epoch 3/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9996 - loss: 0.0153 - val_accuracy: 1.0000 - val_loss: 0.0086
Epoch 4/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0072 - val_accuracy: 1.0000 - val_loss: 0.0045
Epoch 5/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0036 - val_accuracy: 1.0000 - val_loss: 0.0028
Epoch 6/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0025 - val_accuracy: 1.0000 - val_loss: 0.0019
Epoch 7/20
[1m100/100[0m 

In [6]:
import numpy as np

# Example new record
new_record = {
    'reported_beneficiaries': 300,
    'verified_beneficiaries': 290,
    'beneficiary_gap': 10,
    'funding_amount': 150000,
    'impact_score': 0.95,
    'ngo_id': 'NGO-001',
    'project_type': 'Tree Plantation',
    'location': 'Delhi'
}

# Convert to DataFrame
new_df = pd.DataFrame([new_record])

# Encode categorical features using the same LabelEncoders
for col in ['ngo_id', 'project_type', 'location']:
    new_df[col] = encoders[col].transform(new_df[col])

# Scale numeric features using the same scaler
new_scaled = scaler.transform(new_df)

# Predict
prediction = model.predict(new_scaled)
prob = model.predict_proba(new_scaled)[0][0] if hasattr(model, "predict_proba") else prediction[0]

print("Predicted class (0=real, 1=fake):", int(prediction[0] > 0.5))
print("Probability of being fake:", float(prob))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 308ms/step
Predicted class (0=real, 1=fake): 0
Probability of being fake: 2.1979194571031258e-05


  print("Predicted class (0=real, 1=fake):", int(prediction[0] > 0.5))
  print("Probability of being fake:", float(prob))


In [7]:
# Example real NGO record
real_record = {
    'reported_beneficiaries': 250,
    'verified_beneficiaries': 245,
    'beneficiary_gap': 5,
    'funding_amount': 120000,
    'impact_score': 0.92,
    'ngo_id': 'NGO-007',
    'project_type': 'Education Drive',
    'location': 'Bangalore'
}

# Convert to DataFrame
new_df = pd.DataFrame([real_record])

# Encode categorical features using the same LabelEncoders
for col in ['ngo_id', 'project_type', 'location']:
    new_df[col] = encoders[col].transform(new_df[col])

# Scale numeric features
new_scaled = scaler.transform(new_df)

# Predict
prediction = model.predict(new_scaled)
prob = prediction[0][0]  # TensorFlow outputs an array

print("Predicted class (0=real, 1=fake):", int(prediction[0][0] > 0.5))
print("Probability of being fake:", float(prob))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Predicted class (0=real, 1=fake): 0
Probability of being fake: 1.0610242497932632e-05


In [8]:
fake_records = [
    # huge gap, low impact score, high funding
    {
        'reported_beneficiaries': 900,
        'verified_beneficiaries': 50,
        'beneficiary_gap': 850,
        'funding_amount': 350000,
        'impact_score': 0.05,
        'ngo_id': 'NGO-012',
        'project_type': 'Food Distribution',
        'location': 'Mumbai'
    },
    # reported high but verified zero, tiny impact score
    {
        'reported_beneficiaries': 700,
        'verified_beneficiaries': 0,
        'beneficiary_gap': 700,
        'funding_amount': 300000,
        'impact_score': 0.02,
        'ngo_id': 'NGO-021',
        'project_type': 'Health Camp',
        'location': 'Delhi'
    },
    # moderate reported but verified very low, suspicious funding
    {
        'reported_beneficiaries': 450,
        'verified_beneficiaries': 20,
        'beneficiary_gap': 430,
        'funding_amount': 280000,
        'impact_score': 0.10,
        'ngo_id': 'NGO-033',
        'project_type': 'Education Drive',
        'location': 'Kolkata'
    },
    # extreme gap + low score (looks fabricated)
    {
        'reported_beneficiaries': 1000,
        'verified_beneficiaries': 10,
        'beneficiary_gap': 990,
        'funding_amount': 400000,
        'impact_score': 0.01,
        'ngo_id': 'NGO-005',
        'project_type': 'Tree Plantation',
        'location': 'Chennai'
    },
    # suspicious combination: medium reported but very low verified and low score
    {
        'reported_beneficiaries': 320,
        'verified_beneficiaries': 5,
        'beneficiary_gap': 315,
        'funding_amount': 200000,
        'impact_score': 0.08,
        'ngo_id': 'NGO-018',
        'project_type': 'Women Empowerment',
        'location': 'Pune'
    }
]

# Convert to DataFrame
fake_df = pd.DataFrame(fake_records)

# Encode categorical columns using the same LabelEncoders used during training
for col in ['ngo_id', 'project_type', 'location']:
    # If the encoder doesn't know a label, map to a fallback (most common label)
    le = encoders[col]
    def safe_transform(val):
        if val in le.classes_:
            return le.transform([val])[0]
        else:
            # fallback: use index 0 class (or you can choose another strategy)
            return 0
    fake_df[col] = fake_df[col].apply(safe_transform)

# Scale numeric features using the same scaler
fake_scaled = scaler.transform(fake_df)

# Predict with the trained TensorFlow model
preds = model.predict(fake_scaled)          # sigmoid outputs between 0 and 1
pred_labels = (preds > 0.5).astype(int).ravel()

# Show results
fake_df['predicted_prob_fake'] = preds.ravel()
fake_df['predicted_label'] = np.where(pred_labels==1, 'FAKE', 'REAL')

print("\nPredictions for fake-looking records:\n")
print(fake_df[['reported_beneficiaries','verified_beneficiaries','beneficiary_gap',
               'funding_amount','impact_score','ngo_id','project_type','location',
               'predicted_prob_fake','predicted_label']])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275ms/step

Predictions for fake-looking records:

   reported_beneficiaries  verified_beneficiaries  beneficiary_gap  \
0                     900                      50              850   
1                     700                       0              700   
2                     450                      20              430   
3                    1000                      10              990   
4                     320                       5              315   

   funding_amount  impact_score  ngo_id  project_type  location  \
0          350000          0.05      21             2         5   
1          300000          0.02      30             4         2   
2          280000          0.10      42             1         4   
3          400000          0.01      14             6         1   
4          200000          0.08      27             7         7   

   predicted_prob_fake predicted_label  
0             1.000000