<a href="https://colab.research.google.com/github/Springboard429/LingualSense_Infosys_Internship_Oct2024/blob/Hema/lingualsensedeeplearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import regex as re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer

# Locations of the two raw CSV corpora (absolute paths inside the Colab runtime)
dataset_path = '/content/sample_data/dataset.csv'
language_detection_path = '/content/sample_data/Language Detection.csv'

# Read each CSV into its own DataFrame; no preprocessing happens at this stage
dataset, language_detection = (
    pd.read_csv(csv_path)
    for csv_path in (dataset_path, language_detection_path)
)


In [None]:
# Keep only the text and label columns from each source. The second source names
# its label column 'Language', so it is renamed to match before selecting.
dataset_selected = dataset.loc[:, ['Text', 'language']]
language_detection_selected = (
    language_detection
    .rename(columns={'Language': 'language'})
    .loc[:, ['Text', 'language']]
)

# Stack both sources into one frame, then drop rows with any missing value
# and rows that are exact duplicates of an earlier row.
merged_dataset = (
    pd.concat([dataset_selected, language_detection_selected], ignore_index=True)
    .dropna()
    .drop_duplicates()
)

# Remove special characters
def remove_special_characters(text):
    r"""Return ``text`` as a string with punctuation and symbols stripped out.

    The argument is coerced with ``str()`` first, so non-string values are
    accepted. Every character that is neither a word character (``\w``) nor
    whitespace (``\s``) is deleted; nothing is inserted in its place, and the
    remaining characters keep their original order and spacing.
    """
    non_word_or_space = r'[^\w\s]'
    return re.sub(non_word_or_space, '', str(text))

# Add the model-input column: each 'Text' value with special characters removed
merged_dataset['Cleaned_Text'] = merged_dataset['Text'].map(remove_special_characters)

In [None]:
# Encode target labels as integer ids, then as one-hot rows for the softmax output
label_encoder = LabelEncoder()
merged_dataset['language_encoded'] = label_encoder.fit_transform(merged_dataset['language'])
num_classes = len(label_encoder.classes_)
y = to_categorical(merged_dataset['language_encoded'], num_classes=num_classes)

# Train-test split, done on row positions *before* TF-IDF is fitted.
# Fitting the vectorizer on the full corpus would let held-out rows shape the
# vocabulary and IDF weights (data leakage) and make the test score optimistic.
# test_size and random_state are unchanged, so the partition sizes are unchanged.
row_positions = list(range(len(merged_dataset)))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# TF-IDF vectorization: learn vocabulary and IDF from the training rows only,
# then project every row into that fixed feature space.
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(merged_dataset['Cleaned_Text'].iloc[train_rows])
tfidf_matrix = vectorizer.transform(merged_dataset['Cleaned_Text'])

# Slice features and labels with the same row positions so they stay aligned
X = tfidf_matrix
X_train, X_test = X[train_rows], X[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

In [None]:
# Build the neural network model: a feed-forward classifier over the TF-IDF
# features, with dropout after each hidden layer.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(n_features,)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# One output unit per language; softmax pairs with the one-hot targets
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model on a dense copy of the training features, holding out
# 20% of the training rows for validation.
history = model.fit(
    X_train.toarray(),
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)


In [None]:
# Evaluate the model on the held-out split (features converted to a dense array)
test_metrics = model.evaluate(X_test.toarray(), y_test)
loss, accuracy = test_metrics
for metric_name, metric_value in (("Test Loss", loss), ("Test Accuracy", accuracy)):
    print(f"{metric_name}: {metric_value}")

# Save the trained model
model.save('/content/sample_data/language_detection_model.h5')

# Print all language labels known to the label encoder
print("Classes:", label_encoder.classes_)
