In [2]:
import pandas as pd

# Load the raw obesity survey data from the project's data directory.
file_path = 'data/ObesityDataSet_raw_and_data_sinthetic.csv'
df = pd.read_csv(file_path)

# Display basic info about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appends a stray "None" line).
df.info()

# Show the first few rows to understand its structure
print(df.head())

# Find full duplicate entries.
# keep=False marks every member of a group of identical rows, so the first
# occurrence is listed alongside its copies.
duplicates = df[df.duplicated(keep=False)]

# Display the number of full duplicates and the duplicate entries
print(f"\nNumber of full duplicate rows: {duplicates.shape[0]}")
print("\nFull duplicate entries:")
print(duplicates)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [4]:
# Number of redundant rows: each repeat of a row after its first occurrence.
print(df.duplicated(keep='first').sum())

# Every row that belongs to a group of identical rows, first occurrence
# included, which is why this count is larger than the one printed above.
duplicates = df.loc[df.duplicated(keep=False)]

# Report how many rows are involved, then list them.
print(f"Number of full duplicates: {len(duplicates)}")
print("\nFull duplicate entries:")
print(duplicates)



24
Number of full duplicates: 33

Full duplicate entries:
     Gender   Age  Height  Weight family_history_with_overweight FAVC  FCVC   
97   Female  21.0    1.52    42.0                             no   no   3.0  \
98   Female  21.0    1.52    42.0                             no   no   3.0   
105  Female  25.0    1.57    55.0                             no  yes   2.0   
106  Female  25.0    1.57    55.0                             no  yes   2.0   
145    Male  21.0    1.62    70.0                             no  yes   2.0   
174    Male  21.0    1.62    70.0                             no  yes   2.0   
179    Male  21.0    1.62    70.0                             no  yes   2.0   
184    Male  21.0    1.62    70.0                             no  yes   2.0   
208  Female  22.0    1.69    65.0                            yes  yes   2.0   
209  Female  22.0    1.69    65.0                            yes  yes   2.0   
282  Female  18.0    1.62    55.0                            yes  yes   2

In [10]:
import React, { useState, useEffect } from 'react';
import { BarChart, Bar, XAxis, YAxis, CartesianGrid, Tooltip, Legend, ResponsiveContainer, PieChart, Pie, Cell, ScatterPlot, ScatterChart, ZAxis, Scatter } from 'recharts';
import Papa from 'papaparse';
import _ from 'lodash';

/**
 * Dashboard component: reads the preprocessed obesity CSV once on mount,
 * derives one dataset per chart, and renders the charts in a responsive grid.
 *
 * NOTE(review): this is JSX. The notebook cell holding it was executed as
 * Python and failed with the SyntaxError shown in the cell output; it needs a
 * React toolchain (e.g. a .jsx file) to run.
 */
const ObesityDashboard = () => {
  // Parsed CSV rows. Stored on load; nothing in the render output reads it.
  const [data, setData] = useState([]);
  // True until the load attempt finishes, whether it succeeded or failed.
  const [loading, setLoading] = useState(true);
  // One entry per numeric feature: { name, correlation, absoluteCorrelation }.
  const [numericalCorrelations, setNumericalCorrelations] = useState([]);
  // One entry per obesity category: { name, avgWeight, count, order }.
  const [weightByCategoryData, setWeightByCategoryData] = useState([]);
  // The remaining slots hold { name, totalCount, obesityRate } entries.
  const [familyHistoryData, setFamilyHistoryData] = useState([]);
  const [physicalActivityData, setPhysicalActivityData] = useState([]);
  const [transportData, setTransportData] = useState([]);
  const [nutritionData, setNutritionData] = useState([]);

  // Load and process the CSV once, right after the first render.
  useEffect(() => {
    // An effect callback cannot itself be async, so the work runs inside an
    // immediately invoked async function whose promise is deliberately ignored.
    (async () => {
      try {
        const bytes = await window.fs.readFile('Preprocessed_ObesityDataSet_raw_and_data_sinthetic.csv');
        const csvText = new TextDecoder().decode(bytes);

        const { data: rows } = Papa.parse(csvText, {
          header: true,
          dynamicTyping: true,
          skipEmptyLines: true
        });

        setData(rows);
        processData(rows);
      } catch (error) {
        console.error('Error reading file:', error);
      } finally {
        // Leave the loading state on both the success and the failure path.
        setLoading(false);
      }
    })();
  }, []);

  /**
   * Build every chart dataset from the parsed CSV rows and store each one in
   * its state slot: feature correlations, average weight per category, and
   * obesity rates by family history, physical activity, transport and
   * nutrition.
   *
   * Side effect: every row object gains `ObesityLevel` and `BMI` properties.
   *
   * @param {Object[]} data - Rows as produced by Papa.parse with `header: true`.
   */
  const processData = (data) => {
    // Define obesity levels for ordering
    const obesityOrder = {
      "Insufficient_Weight": 1,
      "Normal_Weight": 2,
      "Overweight_Level_I": 3,
      "Overweight_Level_II": 4,
      "Obesity_Type_I": 5,
      "Obesity_Type_II": 6,
      "Obesity_Type_III": 7
    };

    // Labels that count as "obese" when computing an obesity rate.
    const obeseLabels = ['Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III'];

    // Percentage of rows in `group` carrying an obesity label, rounded to 2 d.p.
    const obesityRateOf = (group) => {
      const obeseCount = group.filter(row => obeseLabels.includes(row.NObeyesdad)).length;
      return parseFloat((obeseCount / group.length * 100).toFixed(2));
    };

    // Label of the bin containing `value`. Bins are half-open [min, max) except
    // the last one, which also includes its upper bound. Missing or non-numeric
    // values are 'Unknown'; without this guard `null >= 0` is true and a null
    // would be counted in the first bin.
    const binLabel = (value, bins) => {
      if (!Number.isFinite(value)) return 'Unknown';
      const lastIndex = bins.length - 1;
      const bin = bins.find((b, index) =>
        value >= b.min && (value < b.max || (index === lastIndex && value <= b.max)));
      return bin ? bin.label : 'Unknown';
    };

    // Group rows by the bin of `row[key]` and summarise each group.
    const summariseBins = (rows, key, bins) =>
      _.map(
        _.groupBy(rows, row => binLabel(row[key], bins)),
        (group, label) => ({
          name: label,
          totalCount: group.length,
          obesityRate: obesityRateOf(group)
        })
      );

    // Add numerical obesity level and BMI
    data.forEach(row => {
      const level = obesityOrder[row.NObeyesdad];
      // An unrecognised label gets null rather than a made-up rank of 0, so the
      // correlation step skips the row instead of treating it as the lowest level.
      row.ObesityLevel = level === undefined ? null : level;
      // BMI = kg / m^2. Height is in metres in this dataset (values such as
      // 1.52), so it must not be divided by 100.
      // NOTE(review): confirm the preprocessed CSV keeps Height in metres.
      row.BMI = row.Weight / (row.Height * row.Height);
    });

    // Calculate correlations with obesity
    const numericalFeatures = ['Age', 'Weight', 'Height', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'];
    const correlations = numericalFeatures.map(feature => {
      const raw = calculateCorrelation(data, feature, 'ObesityLevel');
      // An undefined correlation (NaN) is shown as 0 so the sort comparator
      // below always receives comparable numbers.
      const rounded = Number.isFinite(raw) ? parseFloat(raw.toFixed(4)) : 0;
      return {
        name: featureNameMapping(feature),
        correlation: rounded,
        absoluteCorrelation: Math.abs(rounded)
      };
    });

    // Strongest relationship first, regardless of sign.
    setNumericalCorrelations(correlations.sort((a, b) => b.absoluteCorrelation - a.absoluteCorrelation));

    // Weight by obesity category
    const weightByCategory = _.chain(data)
      .groupBy('NObeyesdad')
      .map((group, category) => ({
        name: formatCategoryName(category),
        avgWeight: parseFloat(_.meanBy(group, 'Weight').toFixed(2)),
        count: group.length,
        order: obesityOrder[category]
      }))
      .sortBy('order')
      .value();

    setWeightByCategoryData(weightByCategory);

    // Family history analysis
    setFamilyHistoryData(processCategoricalFeature(data, 'family_history_with_overweight'));

    // Physical activity
    const fafGroups = [
      { min: 0, max: 1, label: 'No activity' },
      { min: 1, max: 2, label: 'Light activity' },
      { min: 2, max: 3, label: 'Moderate activity' },
      { min: 3, max: 4, label: 'High activity' }
    ];

    // Highest obesity rate first.
    const physicalActivity = _.sortBy(summariseBins(data, 'FAF', fafGroups), 'obesityRate').reverse();
    setPhysicalActivityData(physicalActivity);

    // Transportation method
    setTransportData(processCategoricalFeature(data, 'MTRANS'));

    // Nutrition-related factors (FAVC - high caloric food, FCVC - vegetable consumption)
    const favcData = processCategoricalFeature(data, 'FAVC');
    const fcvcGroups = [
      { min: 1, max: 2, label: 'Low vegetable consumption' },
      { min: 2, max: 3, label: 'Moderate vegetable consumption' },
      { min: 3, max: 4, label: 'High vegetable consumption' }
    ];
    const fcvcAnalysis = summariseBins(data, 'FCVC', fcvcGroups);

    // Combine FAVC and FCVC data
    setNutritionData([...favcData, ...fcvcAnalysis]);
  };

  // Utility functions
  /**
   * Pearson correlation coefficient between two numeric fields of `data`.
   *
   * Rows where either field is not a finite number (null, undefined, NaN, or a
   * string left behind by CSV parsing) are ignored.
   *
   * @param {Object[]} data - Rows to read the two fields from.
   * @param {string} xKey - Property name of the first variable.
   * @param {string} yKey - Property name of the second variable.
   * @returns {number} Coefficient in [-1, 1], or 0 when it is undefined (no
   *   usable rows, or one of the variables has zero variance), so callers
   *   never receive NaN.
   */
  const calculateCorrelation = (data, xKey, yKey) => {
    const pairs = data
      .filter(row => Number.isFinite(row[xKey]) && Number.isFinite(row[yKey]))
      .map(row => [row[xKey], row[yKey]]);

    if (pairs.length === 0) return 0;

    const xMean = pairs.reduce((sum, [x]) => sum + x, 0) / pairs.length;
    const yMean = pairs.reduce((sum, [, y]) => sum + y, 0) / pairs.length;

    let numerator = 0;
    let xDenominator = 0;
    let yDenominator = 0;

    pairs.forEach(([x, y]) => {
      const xDiff = x - xMean;
      const yDiff = y - yMean;
      numerator += xDiff * yDiff;
      xDenominator += xDiff * xDiff;
      yDenominator += yDiff * yDiff;
    });

    const denominator = Math.sqrt(xDenominator * yDenominator);
    // A constant variable makes the denominator zero; report "no correlation"
    // instead of dividing by zero.
    return denominator === 0 ? 0 : numerator / denominator;
  };

  /**
   * Obesity rate for each distinct value of a categorical column.
   *
   * @param {Object[]} data - Parsed rows; each is read for `feature` and `NObeyesdad`.
   * @param {string} feature - Column to group by.
   * @returns {{name: string, totalCount: number, obesityRate: number}[]}
   *   One entry per category, highest obesity rate first. `obesityRate` is the
   *   percentage (2 d.p.) of the category's rows labelled Obesity_Type_I/II/III.
   */
  const processCategoricalFeature = (data, feature) => {
    const obeseLabels = new Set(['Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III']);

    const summaries = _.map(_.groupBy(data, feature), (group, category) => {
      const obeseCount = group.filter(row => obeseLabels.has(row.NObeyesdad)).length;
      return {
        name: formatCategoryName(category),
        totalCount: group.length,
        obesityRate: parseFloat((obeseCount / group.length * 100).toFixed(2))
      };
    });

    // sortBy is ascending; reverse to put the highest rate first.
    return _.sortBy(summaries, 'obesityRate').reverse();
  };

  /**
   * Turn a raw category value into a display label: underscores become
   * spaces and each word is capitalised.
   *
   * Upper-case Roman numerals are kept as written, so "Obesity_Type_III"
   * becomes "Obesity Type III" rather than "Obesity Type Iii".
   *
   * @param {*} category - Raw value; non-strings are converted with String().
   * @returns {string} The label, or 'Unknown' for a falsy value and for the
   *   strings "null" / "undefined" (the keys that grouping produces for rows
   *   with a missing value).
   */
  const formatCategoryName = (category) => {
    if (!category) return 'Unknown';

    const text = String(category);
    if (text === 'null' || text === 'undefined') return 'Unknown';

    const romanNumeral = /^[IVX]+$/;

    return text
      .replace(/_/g, ' ')
      .split(' ')
      .map(word => (
        romanNumeral.test(word)
          ? word
          : word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()
      ))
      .join(' ');
  };

  // Translate a dataset column code into the label shown on the chart axis.
  // Codes without an entry are returned unchanged.
  const featureNameMapping = (feature) => {
    const labels = Object.fromEntries([
      ['Age', 'Age'],
      ['Weight', 'Weight'],
      ['Height', 'Height'],
      ['FCVC', 'Vegetable Consumption'],
      ['NCP', '# of Main Meals'],
      ['CH2O', 'Water Consumption'],
      ['FAF', 'Physical Activity'],
      ['TUE', 'Technology Use Time']
    ]);

    const label = labels[feature];
    if (label) {
      return label;
    }
    return feature;
  };

  const COLORS = ['#0088FE', '#00C49F', '#FFBB28', '#FF8042', '#8884d8', '#82ca9d', '#ffc658'];

  if (loading) {
    return <div className="p-4">Loading data...</div>;
  }

  return (
    <div className="p-4 space-y-8">
      <h1 className="text-2xl font-bold">Obesity Correlation Analysis</h1>
      
      <div className="grid grid-cols-1 md:grid-cols-2 gap-6">
        <div className="bg-white p-4 rounded shadow">
          <h2 className="text-lg font-semibold mb-4">Correlation of Factors with Obesity Level</h2>
          <ResponsiveContainer width="100%" height={300}>
            <BarChart
              data={numericalCorrelations}
              layout="vertical"
              margin={{ top: 5, right: 30, left: 100, bottom: 5 }}
            >
              <CartesianGrid strokeDasharray="3 3" />
              <XAxis type="number" domain={[-1, 1]} />
              <YAxis dataKey="name" type="category" width={100} />
              <Tooltip />
              <Legend />
              <Bar dataKey="correlation" fill="#8884d8" />
            </BarChart>
          </ResponsiveContainer>
        </div>

        <div className="bg-white p-4 rounded shadow">
          <h2 className="text-lg font-semibold mb-4">Average Weight by Obesity Category</h2>
          <ResponsiveContainer width="100%" height={300}>
            <BarChart
              data={weightByCategoryData}
              margin={{ top: 5, right: 30, left: 20, bottom: 70 }}
            >
              <CartesianGrid strokeDasharray="3 3" />
              <XAxis dataKey="name" angle={-45} textAnchor="end" height={70} />
              <YAxis label={{ value: 'Weight (kg)', angle: -90, position: 'insideLeft' }} />
              <Tooltip />
              <Bar dataKey="avgWeight" fill="#82ca9d" />
            </BarChart>
          </ResponsiveContainer>
        </div>

        <div className="bg-white p-4 rounded shadow">
          <h2 className="text-lg font-semibold mb-4">Obesity Rate by Family History</h2>
          <ResponsiveContainer width="100%" height={300}>
            <BarChart
              data={familyHistoryData}
              margin={{ top: 5, right: 30, left: 20, bottom: 5 }}
            >
              <CartesianGrid strokeDasharray="3 3" />
              <XAxis dataKey="name" />
              <YAxis label={{ value: 'Obesity Rate (%)', angle: -90, position: 'insideLeft' }} />
              <Tooltip />
              <Bar dataKey="obesityRate" fill="#ff8042" />
            </BarChart>
          </ResponsiveContainer>
        </div>

        <div className="bg-white p-4 rounded shadow">
          <h2 className="text-lg font-semibold mb-4">Obesity Rate by Physical Activity Level</h2>
          <ResponsiveContainer width="100%" height={300}>
            <BarChart
              data={physicalActivityData}
              margin={{ top: 5, right: 30, left: 20, bottom: 5 }}
            >
              <CartesianGrid strokeDasharray="3 3" />
              <XAxis dataKey="name" />
              <YAxis label={{ value: 'Obesity Rate (%)', angle: -90, position: 'insideLeft' }} />
              <Tooltip />
              <Bar dataKey="obesityRate" fill="#8884d8" />
            </BarChart>
          </ResponsiveContainer>
        </div>

        <div className="bg-white p-4 rounded shadow">
          <h2 className="text-lg font-semibold mb-4">Obesity Rate by Transportation Method</h2>
          <ResponsiveContainer width="100%" height={300}>
            <BarChart
              data={transportData}
              margin={{ top: 5, right: 30, left: 20, bottom: 70 }}
            >
              <CartesianGrid strokeDasharray="3 3" />
              <XAxis dataKey="name" angle={-45} textAnchor="end" height={70} />
              <YAxis label={{ value: 'Obesity Rate (%)', angle: -90, position: 'insideLeft' }} />
              <Tooltip />
              <Bar dataKey="obesityRate" fill="#00C49F" />
            </BarChart>
          </ResponsiveContainer>
        </div>

        <div className="bg-white p-4 rounded shadow">
          <h2 className="text-lg font-semibold mb-4">Obesity Rate by Nutritional Factors</h2>
          <ResponsiveContainer width="100%" height={300}>
            <BarChart
              data={nutritionData}
              margin={{ top: 5, right: 30, left: 20, bottom: 70 }}
            >
              <CartesianGrid strokeDasharray="3 3" />
              <XAxis dataKey="name" angle={-45} textAnchor="end" height={70} />
              <YAxis label={{ value: 'Obesity Rate (%)', angle: -90, position: 'insideLeft' }} />
              <Tooltip />
              <Bar dataKey="obesityRate" fill="#FFBB28" />
            </BarChart>
          </ResponsiveContainer>
        </div>
      </div>
    </div>
  );
};

export default ObesityDashboard;

SyntaxError: invalid syntax (2507518402.py, line 1)