In [1]:
import pandas as pd

course_name = ['Data Science', 'Machine Learning', 'Big Data', 'Data Engineer']
duration = [2, 3, 6, 4]
df = pd.DataFrame(data={'course_name': course_name, 'duration': duration})

second_row_data = df.iloc[1]
print(second_row_data)

course_name    Machine Learning
duration                      3
Name: 1, dtype: object


In [2]:
"""  In pandas, both `loc` and `iloc` are used for indexing and selecting data from a DataFrame,
    but they have different ways of referencing data
    
    difference between the two:-
    
    1.`loc` (Label-based Indexing):
        >`loc` is used for selecting data based on labels or index values. 
         It uses labels to specify the rows and columns you want to select.
            
    >You can pass row labels and column labels as arguments to `loc`.
    >The syntax is: `df.loc[row_label, column_label]`.
    
    
    2.`iloc` (Integer-based Indexing):
        
       >`iloc` is used for selecting data based on integer positions. 
         It uses integer indices to specify the rows and columns you want to select. 
      > You can pass integer indices (0-based) to `iloc`.
      >The syntax is: `df.iloc[row_index, column_index]`.
    
    
    examples:- """
    
import pandas as pd

data = {'A': [1, 2, 3],
        'B': [4, 5, 6],
        'C': [7, 8, 9]}
df = pd.DataFrame(data, index=['row1', 'row2', 'row3'])

print(df.loc['row2', 'B'])  

print(df.iloc[1, 1])  

print(df.loc['row1':'row2', :])  
print(df.iloc[0:2, :])  

5
5
      A  B  C
row1  1  4  7
row2  2  5  8
      A  B  C
row1  1  4  7
row2  2  5  8


In [3]:
import pandas as pd
import numpy as np


columns = ['column_1', 'column_2', 'column_3', 'column_4', 'column_5', 'column_6']
indices = [1, 2, 3, 4, 5, 6]


df1 = pd.DataFrame(np.random.rand(6, 6), columns=columns, index=indices)


reindex = [3, 0, 1, 2]
new_df = df1.reindex(reindex)


print("Output of new_df.loc[2]:")
print(new_df.loc[2])

print("\nOutput of new_df.iloc[2]:")
print(new_df.iloc[2])

Output of new_df.loc[2]:
column_1    0.278746
column_2    0.035010
column_3    0.802707
column_4    0.588439
column_5    0.969939
column_6    0.623092
Name: 2, dtype: float64

Output of new_df.iloc[2]:
column_1    0.535791
column_2    0.401078
column_3    0.182735
column_4    0.349600
column_5    0.998349
column_6    0.998868
Name: 1, dtype: float64


 differences between the two outputs:-
    
   1.Output of `new_df.loc[2]`:
    
    > This selects the row with the label/index 2 after reindexing.
      However, due to the reindexing, the label/index 2 now corresponds
        to the original row with label/index 1.
        
    >The output contains the data for the row where the original index was 1.
    
    2.Output of `new_df.iloc[2]`:
        
       >This selects the row at integer position 2 after reindexing.
         In this case, the third row after reindexing corresponds to the
         original row with index 2. 
            
        >The output contains the data for the row where the original index was 2.
        
        
    

In [4]:
import pandas as pd
import numpy as np

columns = ['column_1', 'column_2', 'column_3', 'column_4', 'column_5', 'column_6']
indices = [1, 2, 3, 4, 5, 6]


df1 = pd.DataFrame(np.random.rand(6, 6), columns=columns, index=indices)

column_means = df1.mean()
print("Mean of each column:")
print(column_means)

column_2_std = df1['column_2'].std()
print("\nStandard deviation of column 'column_2':")
print(column_2_std)

Mean of each column:
column_1    0.456679
column_2    0.523353
column_3    0.398523
column_4    0.641989
column_5    0.700328
column_6    0.469950
dtype: float64

Standard deviation of column 'column_2':
0.29137518265997864


Explanation:
    
    (i) The code calculates the mean of each column using the. 
         mean() method of the DataFrame.
         It provides the mean value for every column present in the DataFrame.
            
    (ii) The code calculates the standard deviation of the 'column_2' using the .
         std() method applied to that specific column.
         It calculates and outputs the standard deviation value for the specified column.
            
    

In [5]:
import pandas as pd
import numpy as np

columns = ['column_1', 'column_2', 'column_3', 'column_4', 'column_5', 'column_6']
indices = [1,2,3,4,5,6]
df1 = pd.DataFrame(np.random.rand(6,6), columns = columns, index = indices)

df1.loc[2, 'column_2'] = 'string_value'

mean = df1['column_2'].mean()
print("Mean of column 'column_2':")
print(mean)

TypeError: unsupported operand type(s) for +: 'float' and 'str'

  As you can see, the code fails to find the mean of column column_2. 
    This is because the data in the second row of column column_2 is now a string variable,
    and Pandas cannot calculate the mean of a column that contains string variables.
    
    To fix this error, we need to convert the string variable in the second row of column
    column_2 to a numeric variable. We can do this using the astype() method.
    The following code will successfully find the mean of column column_2:
        
        

In [7]:
import pandas as pd
import numpy as np

columns = ['column_1', 'column_2', 'column_3', 'column_4', 'column_5', 'column_6']
indices = [1, 2, 3, 4, 5, 6]

df1 = pd.DataFrame(np.random.rand(6, 6), columns=columns, index=indices)

df1.loc[2, 'column_2'] = -1

column_2_mean = df1['column_2'].mean()
print("Mean of 'column_2' after replacing:", column_2_mean)

Mean of 'column_2' after replacing: 0.21712535255006013


In [9]:
"""   In pandas, a "window function" (also known as "rolling" or "moving" function) is a computation that
    operates on a set of data points (values) within a defined window or range.
    These functions are particularly useful for time-series data and other ordered data where you want to calculate aggregate measures,
     such as moving averages, cumulative sums, or other statistics, over a sliding window.
        
        Window functions are often used to gain insights into trends, patterns,
            or fluctuations in data over time or across sequences.
            They allow you to perform calculations that take into account a specific number of preceding or succeeding data points,
             which helps in smoothing out noise and highlighting underlying trends.
                
    1.Rolling Window Functions:-
    
    >These functions calculate statistics over a sliding window of fixed size that moves through the data.
     You can specify the window size (number of data points) and then calculate various metrics within that window.
        
    >Examples include 'rolling_mean()', 'rolling_sum()', 'rolling_std()', 'rolling_max()', and 'rolling_min()'
    
    2.Expanding Window Functions:-
    
      >Expanding window functions compute statistics for all data points 
        up to a certain point in the dataset, including all preceding data points.
        The window size increases over time.
        
     >Examples include "expanding_mean()", "expanding_sum()", "expanding_std()", "expanding_max()", and "expanding_min()"
    
     3.Exponential Moving Average (EMA):-
        
       >EMA assigns different weights to different data points, giving more importance to recent data.
          It's often used for smoothing data and is a type of rolling window function.
        
        >Example: 'ewm()' 'function with options like span', 'halflife', or 'com.'
        
    4.Window Apply Functions:-
    
      > These functions allow you to apply a custom function to a rolling window of data.
        
      >Example: 'rolling_apply()'.
    5.Window Aggregation Functions:-
    
    
     >These functions aggregate data within a window to produce a single value for each window.
        
     >Example: 'rolling_agg()'."""
    
import pandas as pd
import numpy as np

dates = pd.date_range('2023-01-01', periods=10, freq='D')
data = np.random.randn(10)
df = pd.DataFrame(data, index=dates, columns=['Value'])

rolling_mean = df['Value'].rolling(window=3).mean()

print(rolling_mean)    

2023-01-01         NaN
2023-01-02         NaN
2023-01-03    0.149764
2023-01-04   -0.359306
2023-01-05   -0.628430
2023-01-06   -0.015336
2023-01-07    0.171758
2023-01-08    0.420523
2023-01-09    0.005857
2023-01-10    1.018055
Freq: D, Name: Value, dtype: float64


In [10]:
import pandas as pd

current_date = pd.Timestamp.now()

current_month = current_date.month
current_year = current_date.year

print(f"Current month: {current_month}")
print(f"Current year: {current_year}")

Current month: 8
Current year: 2023


In [12]:
import pandas as pd

def calculate_time_difference(date1, date2):
    try:
        
        timestamp1 = pd.Timestamp(date1)
        timestamp2 = pd.Timestamp(date2)
        
        
        time_difference = timestamp2 - timestamp1
        
        
        days = time_difference.days
        hours, remainder = divmod(time_difference.seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        
        
        print(f"Time difference: {days} days, {hours} hours, {minutes} minutes")
    except Exception as e:
        print("Error:", e)


date1 = input("Enter the first date (YYYY-MM-DD): ")
date2 = input("Enter the second date (YYYY-MM-DD): ")


calculate_time_difference(date1, date2)

Enter the first date (YYYY-MM-DD):  2023-08-01
Enter the second date (YYYY-MM-DD):  2023-08-31


Time difference: 30 days, 0 hours, 0 minutes


In [13]:
import pandas as pd

def convert_to_categorical(file_path, column_name, category_order):
    try:
        
        df = pd.read_csv(file_path)
        
        
        df[column_name] = pd.Categorical(df[column_name], categories=category_order, ordered=True)
        
        
        sorted_df = df.sort_values(by=[column_name])
        
               
        print(sorted_df)
    except Exception as e:
        print("Error:", e)


file_path = input("Enter the file path (CSV): ")
column_name = input("Enter the column name to convert: ")
category_order = input("Enter the category order (comma-separated): ").split(',')

convert_to_categorical(file_path, column_name, category_order)

Enter the file path (CSV):  sonu
Enter the column name to convert:  f
Enter the category order (comma-separated):  python


Error: [Errno 2] No such file or directory: 'sonu'


In [14]:
import pandas as pd
import matplotlib.pyplot as plt

def visualize_sales(file_path):
    try:
        
        df = pd.read_csv(file_path)
        
        
        pivot_df = df.pivot_table(index='Date', columns='ProductCategory', values='Sales', aggfunc='sum')
        
    
        pivot_df.plot(kind='bar', stacked=True)
        
        
        plt.title('Sales by Product Category Over Time')
        plt.xlabel('Date')
        plt.ylabel('Sales')
        
        
        plt.show()
    except Exception as e:
        print("Error:", e)


file_path = input("Enter the file path (CSV): ")


visualize_sales(file_path)

Enter the file path (CSV):  pankaj


Error: [Errno 2] No such file or directory: 'pankaj'


In [15]:
import pandas as pd
import statistics
from tabulate import tabulate


def calculate_statistics(file_path):
    try:
        
        df = pd.read_csv(file_path)
        
        
        mean_score = df['Test Score'].mean()
        median_score = df['Test Score'].median()
        mode_scores = statistics.multimode(df['Test Score'])
        
               
        results = [
            ["Mean", mean_score],
            ["Median", median_score],
            ["Mode", ', '.join(map(str, mode_scores))]
        ]
        
        print(tabulate(results, headers=["Statistic", "Value"]))
        
    except Exception as e:
        print("Error:", e)


file_path = input("Enter the file path of the CSV file containing the student data: ")


calculate_statistics(file_path)

ModuleNotFoundError: No module named 'tabulate'