In [3]:
import pandas as pd
import numpy as np


# **Combining Datasets: Concat**

In [4]:
# Example DataFrames
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df2 = pd.DataFrame({'A': [7, 8], 'B': [9, 10]})

print(df1)
print(df2)



   A  B
0  1  4
1  2  5
2  3  6
   A   B
0  7   9
1  8  10


In [5]:
# Concatenation combines two or more DataFrames along a particular axis (rows by default).
# 'ignore_index=True' ensures that the resulting DataFrame's index is reset to a sequential order.
concat_df = pd.concat([df1, df2], ignore_index=True)
print("Concatenated DataFrame:\n", concat_df)

Concatenated DataFrame:
    A   B
0  1   4
1  2   5
2  3   6
3  7   9
4  8  10


In [7]:
test = pd.read_csv("/content/sample_data/california_housing_test.csv")
test.shape

(3000, 9)

In [8]:
train = pd.read_csv("/content/sample_data/california_housing_train.csv")
train.shape

(17000, 9)

In [9]:
combined = pd.concat([test, train], ignore_index=True)
combined.shape

(20000, 9)

# **Combining Datasets: Merge and Join**

In [10]:

# Merging combines DataFrames based on a common column or index.
df3 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df4 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})
print(df3)
print(df4)



  key  value1
0   A       1
1   B       2
2   C       3
  key  value2
0   A       4
1   B       5
2   D       6


In [12]:
# Outer join keeps all rows from both DataFrames, filling missing values with NaN where necessary.
merged_df = pd.merge(df3, df4, on='key', how='outer')
print("\nMerged DataFrame (outer join):\n", merged_df)


Merged DataFrame (outer join):
   key  value1  value2
0   A       1       4
1   B       2       5


In [13]:
# Example DataFrames for joining
# Joining works with DataFrames that have indexes as the common key.
df5 = pd.DataFrame({'value1': [1, 2, 3]}, index=['A', 'B', 'C'])
df6 = pd.DataFrame({'value2': [4, 5, 6]}, index=['A', 'B', 'D'])
print(df5)
print(df6)




   value1
A       1
B       2
C       3
   value2
A       4
B       5
D       6


In [14]:
# Outer join keeps all rows, aligning them by their index.
joined_df = df5.join(df6, how='outer')
print("\nJoined DataFrame:\n", joined_df)


Joined DataFrame:
    value1  value2
A     1.0     4.0
B     2.0     5.0
C     3.0     NaN
D     NaN     6.0


In [45]:
# Example DataFrames
tt1 = pd.DataFrame({'Roll': [1, 2, 3], 'Marks1': [80, 90, 95]})
tt2 = pd.DataFrame({'Roll': [1, 2, 4], 'Marks2': [85, 80, 70]})
print(tt1)
print(tt2)

# task1 -> [combine] the tt1 and tt2 using [merge], [all] students must be present
# task2 -> [combine] the tt1 and tt2 using [join], students who took part in
#          [both] exams should be present

   Roll  Marks1
0     1      80
1     2      90
2     3      95
   Roll  Marks2
0     1      85
1     2      80
2     4      70


In [42]:
#sol1 = pd.merge(tt1, tt2, on='Roll', how='outer',suffixes=('_tt1', '_tt2'))
#sol1

In [46]:
tt1 = tt1.set_index('Roll')
tt2 = tt2.set_index('Roll')


sol2 = tt1.join(tt2,how='inner')
sol2

Unnamed: 0_level_0,Marks1,Marks2
Roll,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80,85
2,90,80


In [36]:

import pandas as pd

# Example DataFrames
tt1 = pd.DataFrame({'Roll': [1, 2, 3], 'Marks1': [80, 90, 95]})
tt2 = pd.DataFrame({'Roll': [1, 2, 4], 'Marks2': [85, 80, 70]})

# Set 'Roll' as the index for both DataFrames
tt1_indexed = tt1.set_index('Roll')
tt2_indexed = tt2.set_index('Roll')

# Join the DataFrames based on the index
result = tt1_indexed.join(tt2_indexed, how='inner')
result

Unnamed: 0_level_0,Marks1,Marks2
Roll,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80,85
2,90,80


# **Aggregation and Grouping**

In [47]:
# Grouping is used to split data into groups and then apply aggregate functions (e.g., sum, mean).
data = {
    'Team': ['A', 'A', 'B', 'B', 'C'],
    'Points': [10, 20, 15, 25, 30],
    'Games': [1, 2, 1, 2, 3]
}
group_df = pd.DataFrame(data)
group_df



Unnamed: 0,Team,Points,Games
0,A,10,1
1,A,20,2
2,B,15,1
3,B,25,2
4,C,30,3


In [48]:
# Group by 'Team' and aggregate using sum for 'Points' and mean for 'Games'.
agg_df = group_df.groupby('Team').agg({'Points': 'sum', 'Games': 'mean'})
print("\nAggregated DataFrame:\n", agg_df)


Aggregated DataFrame:
       Points  Games
Team               
A         30    1.5
B         40    1.5
C         30    3.0


# **Vectorized String Operations**

Vectorized string operations are efficient and apply to entire Series.

In [None]:
# Example Series with strings
names = pd.Series(['Alice', 'Bob', 'Charlie', 'David'])
names



Unnamed: 0,0
0,Alice
1,Bob
2,Charlie
3,David


In [None]:

# Vectorized string operations are efficient and apply to entire Series.
# Convert all strings to lowercase.
lowercase = names.str.lower()
lowercase


Unnamed: 0,0
0,alice
1,bob
2,charlie
3,david


In [None]:
# Check if each string contains the letter 'a' (case insensitive).
contains_a = names.str.contains('a')

print("\nNames containing 'a':\n", contains_a)


Names containing 'a':
 0    False
1    False
2     True
3     True
dtype: bool


# **High-Performance Pandas: eval() and query()**

eval() allows performing arithmetic operations on columns efficiently.

query() allows filtering rows using a string expression.



In [None]:
data = {
    'A': np.random.rand(5),
    'B': np.random.rand(5),
    'C': np.random.rand(5),
}
eval_df = pd.DataFrame(data)
eval_df

Unnamed: 0,A,B,C
0,0.725768,0.879098,0.30232
1,0.083404,0.331023,0.742503
2,0.483491,0.844995,0.075019
3,0.676355,0.304346,0.749677
4,0.227896,0.195614,0.853953


In [None]:
# Create a new column 'D' using eval to calculate 'A + B - C'.
eval_df['D'] = eval_df.eval('A + B - C')
print("\nDataFrame with eval:\n", eval_df)


DataFrame with eval:
           A         B         C         D
0  0.725768  0.879098  0.302320  1.302546
1  0.083404  0.331023  0.742503 -0.328076
2  0.483491  0.844995  0.075019  1.253468
3  0.676355  0.304346  0.749677  0.231024
4  0.227896  0.195614  0.853953 -0.430443


In [None]:
# Here, rows are filtered where 'A > 0.5' and 'B < 0.5'.
filtered_df = eval_df.query('A > 0.5 & B < 0.5')
print("\nFiltered DataFrame with query:\n", filtered_df)


Filtered DataFrame with query:
           A         B         C         D
3  0.676355  0.304346  0.749677  0.231024


pip install pyinstaller

pyinstaller --onefile your_file_name.py