# Csv Loader

In [1]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Number of samples
n = 20

# Generate random housing features
area = np.random.randint(500, 3000, n)           # Square feet
bedrooms = np.random.randint(1, 6, n)            # Number of bedrooms
bathrooms = np.random.randint(1, 4, n)           # Number of bathrooms
age = np.random.randint(0, 30, n)                # Age of house (years)

# Generate price (output column) using a simple formula
price = (
    area * 1500 +
    bedrooms * 50000 +
    bathrooms * 30000 -
    age * 10000 +
    np.random.randint(-50000, 50000, n)  # noise
)

# Create DataFrame
df = pd.DataFrame({
    "Area_sqft": area,
    "Bedrooms": bedrooms,
    "Bathrooms": bathrooms,
    "Age": age,
    "Price": price
})

# Save to CSV
df.to_csv("housing_prices.csv", index=False)

print("CSV file 'housing_prices.csv' created successfully!")
print(df.head())

CSV file 'housing_prices.csv' created successfully!
   Area_sqft  Bedrooms  Bathrooms  Age    Price
0       1360         1          1   29  1871387
1       1794         4          3    5  2882016
2       1630         2          2   21  2434789
3       1595         5          1    9  2588091
4       2138         4          2    3  3476812


# Code 

In [2]:
from langchain_community.document_loaders import CSVLoader

loader = CSVLoader(file_path=r"C:\Users\prana\Desktop\VS_CODE\LangChainModels\DocumentLoaders\housing_prices.csv")

docs = loader.load()

docs[:5]

[Document(metadata={'source': 'C:\\Users\\prana\\Desktop\\VS_CODE\\LangChainModels\\DocumentLoaders\\housing_prices.csv', 'row': 0}, page_content='Area_sqft: 1360\nBedrooms: 1\nBathrooms: 1\nAge: 29\nPrice: 1871387'),
 Document(metadata={'source': 'C:\\Users\\prana\\Desktop\\VS_CODE\\LangChainModels\\DocumentLoaders\\housing_prices.csv', 'row': 1}, page_content='Area_sqft: 1794\nBedrooms: 4\nBathrooms: 3\nAge: 5\nPrice: 2882016'),
 Document(metadata={'source': 'C:\\Users\\prana\\Desktop\\VS_CODE\\LangChainModels\\DocumentLoaders\\housing_prices.csv', 'row': 2}, page_content='Area_sqft: 1630\nBedrooms: 2\nBathrooms: 2\nAge: 21\nPrice: 2434789'),
 Document(metadata={'source': 'C:\\Users\\prana\\Desktop\\VS_CODE\\LangChainModels\\DocumentLoaders\\housing_prices.csv', 'row': 3}, page_content='Area_sqft: 1595\nBedrooms: 5\nBathrooms: 1\nAge: 9\nPrice: 2588091'),
 Document(metadata={'source': 'C:\\Users\\prana\\Desktop\\VS_CODE\\LangChainModels\\DocumentLoaders\\housing_prices.csv', 'row': 4

In [3]:

type(docs)

list

In [4]:
len(docs)

20

In [5]:
docs[0]

Document(metadata={'source': 'C:\\Users\\prana\\Desktop\\VS_CODE\\LangChainModels\\DocumentLoaders\\housing_prices.csv', 'row': 0}, page_content='Area_sqft: 1360\nBedrooms: 1\nBathrooms: 1\nAge: 29\nPrice: 1871387')

In [6]:
print(docs[10].page_content)

Area_sqft: 2635
Bedrooms: 4
Bathrooms: 2
Age: 1
Price: 4239307


In [7]:
docs[12].metadata

{'source': 'C:\\Users\\prana\\Desktop\\VS_CODE\\LangChainModels\\DocumentLoaders\\housing_prices.csv',
 'row': 12}

In [8]:
data_str = "\n\n".join([doc.page_content for doc in docs])
print(data_str)

Area_sqft: 1360
Bedrooms: 1
Bathrooms: 1
Age: 29
Price: 1871387

Area_sqft: 1794
Bedrooms: 4
Bathrooms: 3
Age: 5
Price: 2882016

Area_sqft: 1630
Bedrooms: 2
Bathrooms: 2
Age: 21
Price: 2434789

Area_sqft: 1595
Bedrooms: 5
Bathrooms: 1
Age: 9
Price: 2588091

Area_sqft: 2138
Bedrooms: 4
Bathrooms: 2
Age: 3
Price: 3476812

Area_sqft: 2669
Bedrooms: 1
Bathrooms: 2
Age: 21
Price: 3876747

Area_sqft: 966
Bedrooms: 1
Bathrooms: 2
Age: 28
Price: 1253300

Area_sqft: 1738
Bedrooms: 3
Bathrooms: 1
Age: 17
Price: 2641065

Area_sqft: 830
Bedrooms: 3
Bathrooms: 2
Age: 25
Price: 1237798

Area_sqft: 1982
Bedrooms: 2
Bathrooms: 1
Age: 11
Price: 2952268

Area_sqft: 2635
Bedrooms: 4
Bathrooms: 2
Age: 1
Price: 4239307

Area_sqft: 630
Bedrooms: 4
Bathrooms: 3
Age: 9
Price: 1107185

Area_sqft: 2185
Bedrooms: 3
Bathrooms: 3
Age: 29
Price: 3241204

Area_sqft: 1269
Bedrooms: 4
Bathrooms: 1
Age: 3
Price: 2140279

Area_sqft: 2891
Bedrooms: 4
Bathrooms: 3
Age: 13
Price: 4485599

Area_sqft: 2015
Bedrooms: 1
Bathro

# Using LLMs For Data understanding

In [9]:

from langchain_ollama  import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel ,Field
from dotenv import load_dotenv
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [10]:
class Code(BaseModel):
    code: str =  Field(description="Python code for data understanding and cleaning")
    explanation: str = Field(description="Explanation of the code in Bulllet points")
    

In [11]:
model = ChatOllama(model="qwen3:latest")
structured_llm = model.with_structured_output(Code)

In [12]:
prompt = PromptTemplate(
    template="""
You are a Data Cleaning and Data Understanding Expert.

Your task:
Generate clean, well-structured Python code using pandas to perform basic data understanding and data cleaning on the given dataset.

Include:
- Displaying first few rows
- Checking data types
- Handling missing values
- Removing duplicates
- Basic summary statistics

Dataset:
{data}

Only return Python code. Do not include explanations.
""",
    input_variables=["data"]
)

In [13]:
response = structured_llm.invoke(prompt.format(data=data_str))

In [14]:
print(print(response.code))

import pandas as pd

# Create DataFrame from the dataset
data = [
    {'Area_sqft': 1360, 'Bedrooms': 1, 'Bathrooms': 1, 'Age': 29, 'Price': 1871387},
    {'Area_sqft': 1794, 'Bedrooms': 4, 'Bathrooms': 3, 'Age': 5, 'Price': 2882016},
    {'Area_sqft': 1630, 'Bedrooms': 2, 'Bathrooms': 2, 'Age': 21, 'Price': 2434789},
    {'Area_sqft': 1595, 'Bedrooms': 5, 'Bathrooms': 1, 'Age': 9, 'Price': 2588091},
    {'Area_sqft': 2138, 'Bedrooms': 4, 'Bathrooms': 2, 'Age': 3, 'Price': 3476812},
    {'Area_sqft': 2669, 'Bedrooms': 1, 'Bathrooms': 2, 'Age': 21, 'Price': 3876747},
    {'Area_sqft': 966, 'Bedrooms': 1, 'Bathrooms': 2, 'Age': 28, 'Price': 1253300},
    {'Area_sqft': 1738, 'Bedrooms': 3, 'Bathrooms': 1, 'Age': 17, 'Price': 2641065},
    {'Area_sqft': 830, 'Bedrooms': 3, 'Bathrooms': 2, 'Age': 25, 'Price': 1237798},
    {'Area_sqft': 1982, 'Bedrooms': 2, 'Bathrooms': 1, 'Age': 11, 'Price': 2952268},
    {'Area_sqft': 2635, 'Bedrooms': 4, 'Bathrooms': 2, 'Age': 1, 'Price': 4239307},
    {

In [None]:
import pandas as pd

# Create DataFrame from provided dataset
df = pd.read_csv(r"C:\Users\prana\Desktop\VS_CODE\LangChainModels\DocumentLoaders\housing_prices.csv")

# Display first few rows
print("First few rows:")
print(df.head())

# Check data types
print("\nData types:")
print(df.dtypes)

# Handle missing values
print("\nMissing values:")
print(df.isnull().sum())

# Remove duplicates
df = df.drop_duplicates()

# Basic summary statistics
print("\nSummary statistics:")
print(df.describe())


First few rows:
   Area_sqft  Bedrooms  Bathrooms  Age    Price
0       1360         1          1   29  1871387
1       1794         4          3    5  2882016
2       1630         2          2   21  2434789
3       1595         5          1    9  2588091
4       2138         4          2    3  3476812

Data types:
Area_sqft    int64
Bedrooms     int64
Bathrooms    int64
Age          int64
Price        int64
dtype: object

Missing values:
Area_sqft    0
Bedrooms     0
Bathrooms    0
Age          0
Price        0
dtype: int64

Summary statistics:
         Area_sqft   Bedrooms  Bathrooms        Age         Price
count    20.000000  20.000000  20.000000  20.000000  2.000000e+01
mean   1862.700000   3.100000   1.950000  14.750000  2.861944e+06
std     685.185192   1.372665   0.759155   8.972912  1.057732e+06
min     630.000000   1.000000   1.000000   1.000000  1.107185e+06
25%    1431.250000   2.000000   1.000000   8.500000  2.240366e+06
50%    1766.000000   3.000000   2.000000  13.500000 