In [2]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Data Augmentation parameters
augmentation = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    brightness_range=[0.8, 1.2],  # Adjust brightness
    channel_shift_range=20.0,     # Adjust channel shift
    fill_mode='nearest',
)


In [13]:
import os
import cv2
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define the path to your dataset and where you want to save the augmented images
input_folder = 'dataset'
output_folder = 'augmented_images'

# Ensure output folder exists
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Data augmentation settings
datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    fill_mode='nearest'
)

# Loop through each class folder
for class_folder in os.listdir(input_folder):
    class_path = os.path.join(input_folder, class_folder)
    
    # Ensure it's a folder
    if os.path.isdir(class_path):
        # Create a folder for the class in the output directory
        output_class_path = os.path.join(output_folder, class_folder)
        if not os.path.exists(output_class_path):
            os.makedirs(output_class_path)
        
        # Loop until we have 1000 images
        count = 0
        while count < 1000:
            for filename in os.listdir(class_path):
                if count >= 1500:
                    break
                
                # Check if the file is an image
                if filename.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
                    # Load the image
                    img_path = os.path.join(class_path, filename)
                    img = cv2.imread(img_path)
                    
                    # Check if the image is loaded correctly
                    if img is not None:
                        # Resize the image to ensure consistent input size (optional)
                        img = cv2.resize(img, (224, 224))
                        
                        # Expand dimensions to fit the datagen flow function
                        img = np.expand_dims(img, axis=0)
                        
                        # Generate augmented images
                        for batch in datagen.flow(img, batch_size=1, save_to_dir=output_class_path, save_prefix='aug', save_format='jpg'):
                            count += 1
                            break

                        # If you want to use the original image as well
                        # cv2.imwrite(os.path.join(output_class_path, f"original_{filename}"), img)
                        
                        print(f"Generated {count} images for class {class_folder}")
                    else:
                        print(f"Failed to load image {img_path}")

print("Data augmentation completed!")


Generated 1 images for class dew
Generated 2 images for class dew
Generated 3 images for class dew
Generated 4 images for class dew
Generated 5 images for class dew
Generated 6 images for class dew
Generated 7 images for class dew
Generated 8 images for class dew
Generated 9 images for class dew
Generated 10 images for class dew
Generated 11 images for class dew
Generated 12 images for class dew
Generated 13 images for class dew
Generated 14 images for class dew
Generated 15 images for class dew
Generated 16 images for class dew
Generated 17 images for class dew
Generated 18 images for class dew
Generated 19 images for class dew
Generated 20 images for class dew
Generated 21 images for class dew
Generated 22 images for class dew
Generated 23 images for class dew
Generated 24 images for class dew
Generated 25 images for class dew
Generated 26 images for class dew
Generated 27 images for class dew
Generated 28 images for class dew
Generated 29 images for class dew
Generated 30 images for

In [1]:
import os
import random
import shutil

# Define the directory containing your image classes
source_folder = 'augmented_images'

# Define the directory where you want to save the train and test sets
train_folder = 'train'
test_folder = 'test'

# Create train and test folders if they don't exist
if not os.path.exists(train_folder):
    os.makedirs(train_folder)

if not os.path.exists(test_folder):
    os.makedirs(test_folder)

# Define the train-test split ratio
split_ratio = 0.85  # 80% for training, 20% for testing

# Loop through each class folder
for class_folder in os.listdir(source_folder):
    class_path = os.path.join(source_folder, class_folder)
    
    # Check if it's a directory
    if os.path.isdir(class_path):
        images = [f for f in os.listdir(class_path) if f.endswith(('.jpg', '.jpeg', '.png'))]
        
        # Shuffle the images
        random.shuffle(images)
        
        # Split the images into train and test sets
        train_size = int(len(images) * split_ratio)
        train_images = images[:train_size]
        test_images = images[train_size:]
        
        # Create train and test class folders in train and test directories
        train_class_folder = os.path.join(train_folder, class_folder)
        test_class_folder = os.path.join(test_folder, class_folder)
        
        if not os.path.exists(train_class_folder):
            os.makedirs(train_class_folder)
        
        if not os.path.exists(test_class_folder):
            os.makedirs(test_class_folder)
        
        # Copy images to train and test class folders
        for image in train_images:
            src = os.path.join(class_path, image)
            dst = os.path.join(train_class_folder, image)
            shutil.copy(src, dst)
        
        for image in test_images:
            src = os.path.join(class_path, image)
            dst = os.path.join(test_class_folder, image)
            shutil.copy(src, dst)


In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

# Dash Components
import dash
from dash import Dash, html, dcc, Input, Output
#import dash_bootstrap_components as dbc
df = pd.read_csv("dataa.csv")


In [3]:
df.index

RangeIndex(start=0, stop=6599, step=1)

In [4]:
df

Unnamed: 0,job_title,experience_level,employment_type,work_models,Year,employee_residence,salary,salary_currency,salary_in_usd,company_location,company_size
0,Data Engineer,Mid-level,Full-time,Remote,2024,United States,148100,USD,148100,United States,Medium
1,Data Engineer,Mid-level,Full-time,Remote,2024,United States,98700,USD,98700,United States,Medium
2,Data Scientist,Senior-level,Full-time,Remote,2024,United States,140032,USD,140032,United States,Medium
3,Data Scientist,Senior-level,Full-time,Remote,2024,United States,100022,USD,100022,United States,Medium
4,BI Developer,Mid-level,Full-time,On-site,2024,United States,120000,USD,120000,United States,Medium
...,...,...,...,...,...,...,...,...,...,...,...
6594,Staff Data Analyst,Entry-level,Contract,Hybrid,2020,Canada,60000,CAD,44753,Canada,Large
6595,Staff Data Analyst,Executive-level,Full-time,On-site,2020,Nigeria,15000,USD,15000,Canada,Medium
6596,Machine Learning Manager,Senior-level,Full-time,Hybrid,2020,Canada,157000,CAD,117104,Canada,Large
6597,Data Engineer,Mid-level,Full-time,Hybrid,2020,Austria,65000,EUR,74130,Austria,Large


In [16]:
filt = df["job_title"] == "All"
df_filtered = df[filt].copy()
df_filtered = df_filtered["company_location"].value_counts().sort_values(ascending=False).head(5)
df_filtered

Series([], Name: count, dtype: int64)

In [8]:
names

Index(['On-site', 'Remote', 'Hybrid'], dtype='object', name='work_models')

In [11]:
df_filtered = df["job_title"].value_counts().sort_values(ascending=False).head(10)[::-1]
df_filtered

job_title
Applied Scientist              97
ML Engineer                   113
Research Engineer             136
Data Architect                176
Research Scientist            206
Analytics Engineer            246
Machine Learning Engineer     629
Data Analyst                  910
Data Scientist               1243
Data Engineer                1307
Name: count, dtype: int64

In [17]:
def create_bar_experince(year, job):
    chart_title = "Expected Salary Per Experience Level"
    df_filtered = ""
    if year == "all":
        df_filtered = df.copy()

    else:
        filt = df["Year"] == year
        df_filtered = df[filt].copy()

    if job == "All":
        df_filtered = df_filtered.copy()

    else:
        filt = df_filtered["job_title"] == job
        df_filtered = df_filtered[filt].copy()

    df_filtered = df_filtered.groupby("experience_level")["salary_in_usd"].mean().sort_values(ascending=False)
    return df_filtered


In [23]:
create_bar_experince('All', "AI Developer")

Series([], Name: salary_in_usd, dtype: float64)

In [24]:
fig_experience = px.scatter(df_filtered,
                                    x=df_filtered.index,
                                    y=df_filtered,
                                    color=df_filtered.index,
                                    size=df_filtered,
                                    color_discrete_sequence=["#FCDDB0", "#FF9F9F", "#EDD2F3"],
                                    template="plotly_dark",
                                    labels={"y": "AVG Salary",
                                            "experience_level": "Experience Level"},
                                    opacity=0.8,
                                    )

ValueError: Cannot accept list of column references or list of columns for both `x` and `y`.