In [7]:
# numpy_data_explorer.py
import numpy as np
import time

print("NumPy Data Explorer - Syntecxhub Internship Week 1 \n")

# 1. Array creation
arr_1d = np.arange(1, 11)
arr_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr_random = np.random.randint(0, 100, size=(5, 5))
zeros = np.zeros((3, 4))
ones = np.ones((4, 3))
linspace_arr = np.linspace(0, 1, 11)

print("1. Array Creation Examples:")
print("1D array:", arr_1d)
print("2D array:\n", arr_2d)
print("Random 5x5:\n", arr_random)
print("Zeros (3x4):\n", zeros)
print("Ones (4x3):\n", ones)
print("Linspace:", linspace_arr)

# 2. Indexing & Slicing
print("\n2. Indexing & Slicing:")
print("arr_2d[1, 2] =", arr_2d[1, 2])
print("First row:", arr_2d[0])
print("Last column:", arr_2d[:, -1])
print("Subarray (rows 0-1, cols 1-2):\n", arr_2d[0:2, 1:3])

# 3. Mathematical & Statistical Operations
print("\n3. Mathematical & Statistical Operations:")
print("Mean of arr_random:", np.mean(arr_random))
print("Std deviation:", np.std(arr_random))
print("Sum axis=0:", np.sum(arr_random, axis=0))
print("Max per row:", np.max(arr_random, axis=1))
print("Element-wise sqrt:\n", np.sqrt(arr_random.astype(float)))

# 4. Reshaping & Broadcasting
print("\n4. Reshaping & Broadcasting:")
# A common and flexible alternative for converting to 1D
flat = arr_random.reshape(25)           # or arr_random.flatten() / .ravel()
print("\n→ Reshaped to 1D (25,):")
print(flat)     #this will reshape the 5x5 array in to 1 1d array of 25 .

reshaped_2 = flat.reshape(5, 5)
print(reshaped_2)    #This will reshape back from 1d to 5x5.
row_vector = np.array([[10, 20, 30, 40, 50]])
broadcast_add = reshaped_2 + row_vector           # Broadcasting
broadcast_mult = reshaped_2 * 2
print("After broadcasting add (first row + [10,20,30,40,50]):\n", broadcast_add)

# 5. Save / Load
np.save('my_array.npy', arr_random)
loaded = np.load('my_array.npy')
print("\n5. Save/Load: Loaded array matches original?", np.array_equal(arr_random, loaded))

# 6. Performance comparison with Python lists
size = 10_000_000
py_list = list(range(size))
np_arr = np.arange(size)

start = time.time()
sum_py = sum(py_list)
end = time.time()
print(f"\nPython list sum: {end-start:.4f} sec")

start = time.time()
sum_np = np_arr.sum()
end = time.time()
print(f"NumPy sum: {end-start:.4f} sec")
print(f"NumPy is ~{sum_py/sum_np:.0f}x faster in this case!")

print("\n=== NumPy Data Explorer Completed ===")

NumPy Data Explorer - Syntecxhub Internship Week 1 

1. Array Creation Examples:
1D array: [ 1  2  3  4  5  6  7  8  9 10]
2D array:
 [[1 2 3]
 [4 5 6]
 [7 8 9]]
Random 5x5:
 [[93 71 14 87 32]
 [86 12 48 51 38]
 [87 92 17 58 88]
 [77 88 56 24 60]
 [65 95 50 82 55]]
Zeros (3x4):
 [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Ones (4x3):
 [[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]
Linspace: [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]

2. Indexing & Slicing:
arr_2d[1, 2] = 6
First row: [1 2 3]
Last column: [3 6 9]
Subarray (rows 0-1, cols 1-2):
 [[2 3]
 [5 6]]

3. Mathematical & Statistical Operations:
Mean of arr_random: 61.04
Std deviation: 26.180114591040276
Sum axis=0: [408 358 185 302 273]
Max per row: [93 86 92 88 95]
Element-wise sqrt:
 [[9.64365076 8.42614977 3.74165739 9.32737905 5.65685425]
 [9.2736185  3.46410162 6.92820323 7.14142843 6.164414  ]
 [9.32737905 9.59166305 4.12310563 7.61577311 9.38083152]
 [8.77496439 9.38083152 7.48331477 4.89897949 7.74596669]
 [8.06225

In [9]:
# pandas_csv_analysis.py
import pandas as pd

# Using a popular public dataset (Titanic)
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

print("Pandas CSV Reader & Basic Analysis - Syntecxhub Week 1 \n")
print("Dataset shape:", df.shape)

# 1. Inspect head/tail/dtypes
print("\nFirst 5 rows:")
print(df.head())
print("\nLast 5 rows:")
print(df.tail())
print("\nData types:\n", df.dtypes)

# 2. Summary statistics
print("\nSummary Statistics:")
print(df.describe(include='all'))

print("\nCustom stats:")
print("Age mean:", df['Age'].mean())
print("Age median:", df['Age'].median())
print("Fare max:", df['Fare'].max())
print("Survived count:\n", df['Survived'].value_counts())

# 3. Filtering, selecting, slicing
print("\nFiltering Examples:")
female_passengers = df[df['Sex'] == 'female']
print("Number of female passengers:", len(female_passengers))

survived_pclass1 = df[(df['Survived'] == 1) & (df['Pclass'] == 1)]
print("Survived in 1st class:", len(survived_pclass1))

# Selecting columns
subset_cols = df[['Name', 'Age', 'Fare', 'Survived']]
print("\nSelected columns sample:")
print(subset_cols.head())

# Slicing rows 100 to 110
print("\nRows 100-110:")
print(df.iloc[100:111])

# 4. Save filtered results
high_fare = df[df['Fare'] > df['Fare'].quantile(0.9)]
high_fare.to_csv('high_fare_passengers.csv', index=False)
high_fare.to_excel('high_fare_passengers.xlsx', index=False)
print(f"\nSaved {len(high_fare)} high-fare passengers to CSV & Excel")

print("\n=== Pandas Analysis Completed Successfully ===")

Pandas CSV Reader & Basic Analysis - Syntecxhub Week 1 

Dataset shape: (891, 12)
First 5 rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0    

In [13]:
# data_cleaning_utility.py
import pandas as pd
import numpy as np

# Using a messy sample dataset (or you can use any real messy CSV)
data = {
    'Name': ['  Alice', 'Bob ', 'Charlie', 'David', None, 'Alice'],
    'Age': [25, 'thirty', 35, 40, None, 25],
    ' Salary ': [50000, 60000, None, 70000, 55000, 50000],
    'Join_Date': ['2021-01-15', '2021/02/20', '2021-03-10', 'invalid', '2021-05-01', '2021-01-15'],
    'Department': ['HR', 'IT', 'IT', 'Finance', 'HR', 'HR']
}
df = pd.DataFrame(data)

print("=== Data Cleaning Utility - Syntecxhub Week 1 ===\n")
print("Original Data:")
print(df)
print("\nInitial Info:")
print(df.info())

# Cleaning log
log = []

# 1. Standardize column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
log.append("Standardized column names")

# 2. Handle missing values
df['name'].fillna('Unknown', inplace=True)                                                     #Replaces any NaN (missing) values with the string "Unknown".
df['age'].fillna(df['age'].median() if df['age'].dtype != 'object' else np.nan, inplace=True)
df['salary'].fillna(df['salary'].median(), inplace=True)
log.append(f"Filled missing name with 'Unknown', salary with median")
print("This Is the data after handeling missing values : \n",df)
# 3. Fix incorrect dtypes
# Age: convert to numeric
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['age'].fillna(df['age'].median(), inplace=True)

# Join date: parse dates
df['join_date'] = pd.to_datetime(df['join_date'], errors='coerce')
log.append("Converted age to numeric & parsed join_date")

# 4. Remove duplicates
initial_rows = len(df)
df.drop_duplicates(inplace=True)
log.append(f"Removed {initial_rows - len(df)} duplicate rows")

# 5. Final cleaned dataset
print("\nCleaned Dataset:")
print(df)
print("\nFinal Info:")
print(df.info())

# Save cleaned data
df.to_csv('cleaned_dataset.csv', index=False)
print(f"\nCleaned dataset saved as 'cleaned_dataset.csv' ({len(df)} rows)")
print("\n=== Data Cleaning Utility Completed ===")

=== Data Cleaning Utility - Syntecxhub Week 1 ===

Original Data:
      Name     Age   Salary    Join_Date Department
0    Alice      25   50000.0  2021-01-15         HR
1     Bob   thirty   60000.0  2021/02/20         IT
2  Charlie      35       NaN  2021-03-10         IT
3    David      40   70000.0     invalid    Finance
4     None    None   55000.0  2021-05-01         HR
5    Alice      25   50000.0  2021-01-15         HR

Initial Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        5 non-null      object 
 1   Age         5 non-null      object 
 2    Salary     5 non-null      float64
 3   Join_Date   6 non-null      object 
 4   Department  6 non-null      object 
dtypes: float64(1), object(4)
memory usage: 372.0+ bytes
None
This Is the data after handeling missing values : 
       name     age   salary   join_date department
0  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['name'].fillna('Unknown', inplace=True)                                                     #Replaces any NaN (missing) values with the string "Unknown".
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median() if df['age'].dtype != 'object' else np.