In [None]:
import pandas as pd

# Read the CSV dataset into a DataFrame
csv_path = "mtsamples.csv"
data = pd.read_csv(csv_path)

# Preview the leading rows to sanity-check the load
first_rows = data.head()
print(first_rows)


In [None]:
import re
from functools import lru_cache

# Matches every character that is not an ASCII letter.
_NON_ALPHA_RE = re.compile(r"[^a-zA-Z]")


@lru_cache(maxsize=1)
def _load_nlp_resources():
    """Build the English stop-word set and the lemmatizer once and reuse them.

    The NLTK imports are done here, lazily, so that this cell is
    self-contained on a fresh kernel and so that non-string inputs never
    touch NLTK at all.

    NOTE: relies on the NLTK "stopwords" and "wordnet" corpora already being
    installed locally (e.g. via ``nltk.download``); nothing is downloaded here.

    Returns:
        tuple: ``(stop_words, lemmatizer)`` where ``stop_words`` is a
        frozenset of English stop words and ``lemmatizer`` is a
        ``WordNetLemmatizer`` instance.
    """
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    return frozenset(stopwords.words("english")), WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one piece of free text for vectorization.

    Steps, in order: lowercase, replace every non-ASCII-letter character with
    a space, split on whitespace, drop English stop words, lemmatize each
    remaining token, and join the tokens back with single spaces.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example the
            float NaN pandas uses for missing values) is treated as missing.

    Returns:
        str: The cleaned, space-separated tokens, or ``""`` when ``text`` is
        not a string.
    """
    # Missing values (NaN) and other non-string inputs yield an empty document
    if not isinstance(text, str):
        return ""

    # Cached: the stop-word list and lemmatizer are built only on first use
    stop_words, lemmatizer = _load_nlp_resources()

    # Lowercase, then blank out digits, punctuation and any other non-letters
    cleaned = _NON_ALPHA_RE.sub(" ", text.lower())

    # Filter stop words first, then lemmatize only the surviving tokens
    tokens = [
        lemmatizer.lemmatize(word)
        for word in cleaned.split()
        if word not in stop_words
    ]

    return " ".join(tokens)

# Run every transcription through preprocess_text and store the result
# in a new column alongside the raw text
transcriptions = data["transcription"]
data["preprocessed_transcription"] = transcriptions.apply(preprocess_text)

# Preview the cleaned text
cleaned_preview = data["preprocessed_transcription"].head()
print(cleaned_preview)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer, then learn the vocabulary and vectorize the
# preprocessed text in one pass
vectorizer = TfidfVectorizer()
preprocessed_docs = data["preprocessed_transcription"]
tfidf_matrix = vectorizer.fit_transform(preprocessed_docs)

# Expand the matrix to a dense array and wrap it in a DataFrame with one
# column per vocabulary term
dense_scores = tfidf_matrix.toarray()
feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(dense_scores, columns=feature_names)

# Preview the TF-IDF features
tfidf_preview = tfidf_df.head()
print(tfidf_preview)



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Hold out 20% of the samples for evaluation (fixed seed for a repeatable split)
features = tfidf_df
labels = data["medical_specialty"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Build a linear-kernel SVM and fit it on the training portion
svm = SVC(kernel="linear")
svm.fit(X_train, y_train)

# Predict the held-out samples and print the per-class metrics
y_pred = svm.predict(X_test)
report = classification_report(y_test, y_pred, zero_division=1)
print(report)


# Find the best model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Split the dataset into training and testing sets (same seed as the SVM
# baseline so both cells evaluate on the same held-out rows)
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_df, data["medical_specialty"], test_size=0.2, random_state=42
)

# Candidate classifiers, keyed by the name used in the printed output
classifiers = {
    "SVM": SVC(kernel="linear"),
    "Naive Bayes": MultinomialNB(),
    # Seeded so the forest, and therefore the comparison, is reproducible
    "Random Forest": RandomForestClassifier(random_state=42),
}

results = {}      # name -> classification report as a nested dict
predictions = {}  # name -> predicted test labels for that classifier

for name, classifier in classifiers.items():
    # Train the classifier
    classifier.fit(X_train, y_train)

    # Predict the labels for the test set and keep them per classifier, so the
    # printed reports below do not all reuse the last model's predictions
    y_pred = classifier.predict(X_test)
    predictions[name] = y_pred

    # Evaluate the classifier and store the machine-readable report
    report = classification_report(y_test, y_pred, output_dict=True)
    results[name] = report

# Select the best classifier by accuracy; on a tie the first one listed wins
best_classifier = max(results, key=lambda name: results[name]["accuracy"])
best_metric_value = results[best_classifier]["accuracy"]

# Print each classifier's own report, then the winner
for name, model_predictions in predictions.items():
    print(f"Classifier: {name}")
    print(classification_report(y_test, model_predictions))
    print()

print(f"Best Classifier: {best_classifier}")
