In [0]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<url-encoded download URL>" entries. The download
# loop splits each entry on ':' and URL-decodes the second part. The URL is a
# signed link carrying an expiry (X-Goog-Expires), so it stops working over time.
DATA_SOURCE_MAPPING = 'datasets-for-pandas:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F71132%2F151729%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240921%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240921T144542Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Dae9959a3a31884f0053128a808530bcc5846912c78c807e315daf93f8ba1441ab2b0a3cfedea5cfb59b09f2cde7c7eeade331359ceba424ba22ce705c60e998468e01a216f6ef18fc5c80965e4ec19162e295acbc1190590a4201953ce8238156e1c642e9ce8e4a343a658fd6486d47c6c3fb4b5e056e1101f0ac91ffd81e798a955d19b06cdad26598e2c43c73279f2dd63f4f4a17b67e51d3191c467aae47026db4ba11a5aaea69b7beb2da8b5e622f095b378055d7cf93f0f5bd24710cbc634db25526da91293bbb142774f9715c7fedd06718ca051603a1ad5f3eeac0425ca3d4a640e7a7966b1093d62d092796594297d1afc2beee4d439216c0f8fd1b5'

# Root directories for downloaded inputs and for notebook outputs.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced by any of the visible cells.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING and unpack it under
# KAGGLE_INPUT_PATH/<directory>. A source that fails to download is reported
# and skipped so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    if not data_source_mapping:
        # Tolerate an empty mapping or a stray trailing comma.
        continue
    # Only the first ':' separates the directory name from the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without a usable value only the
            # running byte count is shown instead of a progress bar.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length and content_length.isdigit() else 0
            print(f"Downloading {directory}, {content_length or 'unknown'} bytes compressed")
            dl = 0
            # Read fixed-size chunks until the response returns b'' (end of body).
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                if total_length:
                    done = min(50, int(50 * dl / total_length))
                    progress_bar = f"[{'=' * done}{' ' * (50-done)}] "
                else:
                    progress_bar = ''
                sys.stdout.write(f"\r{progress_bar}{dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise risk reading a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bound to a name that does not shadow the tarfile module.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is one of its subclasses.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the student scores CSV from the downloaded dataset.
# NOTE(review): later cells index the first column as 'ï»¿student', which looks
# like a UTF-8 byte-order mark decoded as ISO-8859-1. encoding='utf-8-sig' would
# presumably give a clean 'student' header, but those cells would then need the
# new name too - confirm before changing.
abc_csv_path = '/kaggle/input/datasets-for-pandas/abc.csv'
score = pd.read_csv(abc_csv_path, encoding='ISO-8859-1')

Got it! Here are some useful and important tasks/questions for working with your CSV file using Pandas:

1. **Basic Overview**:
   - How do you load the CSV file into a Pandas DataFrame?
   - How would you check the first few rows of the dataset?

2. **Data Inspection**:
   - How do you get a summary of statistics for each subject (e.g., mean, min, max, etc.)?
   - How can you check the data types of the columns?

3. **Data Filtering**:
   - How can you filter students who scored above 80 in language?
   - How would you filter students who scored below 50 in any subject?

4. **Data Sorting**:
   - How do you sort the data by the science column in descending order?

5. **Data Aggregation**:
   - How would you calculate the average score for each subject?
   - How do you find the student with the highest total score across all subjects?

6. **Missing Data**:
   - How can you check for missing values in the dataset?

7. **Visualization**:
   - How would you plot a bar chart showing each student's total score across subjects?

Which ones would you like to explore or answer first?

In [None]:
# Preview the first five rows (head() defaults to n=5).
score.head()

In [None]:
# How do you get a summary of statistics for each subject (e.g., mean, min, max, etc.)?
# Mean mark of three subject columns, each computed on its own column.
scienceM, mathsM, historyM = (
    score[subject].mean() for subject in ('science', 'maths', 'history')
)
print(f"The Mean of Subjects are S : {scienceM:.2f} , M : {mathsM:.2f} & H: {historyM:.2f}")

In [None]:
# Highest science mark and lowest maths mark in the frame.
scienceMax, mathsMin = score['science'].max(), score['maths'].min()
print(f"The S : {scienceMax}  and M: {mathsMin}")

In [None]:
# Show the column labels of the loaded frame.
score.columns

In [None]:
# Print pandas' concise summary of the frame (columns, non-null counts, dtypes).
score.info()

In [None]:
# How can you filter students who scored above 80 in language?
# The boolean mask keeps the matching rows with all of their columns.
language_above_80 = score['language'] > 80
score[language_above_80]

In [None]:
# Same filter, but keep only the student-name column: rows and column are
# selected in a single .loc lookup.
score.loc[score['language'] > 80, 'ï»¿student']

In [None]:
# How would you filter students who scored below 50 in any subject?
# All four subject columns used in this notebook are checked; 'language' was
# previously left out, so a student failing only language was not listed.
# NOTE(review): extend the list if the CSV has further subject columns.
subject_columns = ['science', 'maths', 'history', 'language']
score[(score[subject_columns] < 50).any(axis=1)]

In [None]:
# Display the whole `score` frame.
score

In [None]:
# How do you sort the data by the science column in descending order?
# Returns a sorted copy; `score` itself keeps its original row order.
score.sort_values(by='science', ascending=False)

In [None]:
# Names of the students whose science score is exactly 90.
# The vectorised comparison builds the same boolean mask as a per-element
# lambda, and .loc selects rows and column in one step instead of chaining.
s = score['science'] == 90
score.loc[s, 'ï»¿student']

In [None]:
# How can you check for missing values in the dataset?
# Count the missing entries per column; a column showing 0 has none missing.
# (dropna() only returns the frame minus incomplete rows, it reports nothing.)
score.isna().sum()

In [None]:
# Visualization:
# Line plot of the science scores against the frame's row index, drawn on an
# explicit Axes and labelled so the figure can be read on its own.
s = score['science']
fig, ax = plt.subplots()
ax.plot(s)
ax.set(title='Science score by row', xlabel='Row index', ylabel='Science score');

In [None]:
# Display the Series currently bound to `s` (the science column assigned in the previous cell).
s

In [None]:
# Visualization:

# How would you plot a bar chart showing each student's total score across subjects?
