In [27]:
# Install the third-party libraries this notebook imports (versions are not pinned)
!pip install easyocr transformers nltk



In [28]:
import re

import easyocr
import matplotlib.pyplot as plt
import nltk
import numpy as np
import tensorflow as tf
from google.colab import files
from tensorflow import keras
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
# Display names of the supported languages. predict_language() indexes this
# list with the predicted class id, so the position of each name matters.
LANGUAGES = [
    'Python',
    'Java',
    'JavaScript',
    'C++',
    'Ruby',
    'Go',
    'PHP',
    'Swift',
    'Kotlin',
    'Rust',
]

In [30]:
# Step 1: OCR using EasyOCR
# easyocr.Reader instances keyed by their language tuple. Constructing a
# Reader is the expensive part of OCR, so each one is built once and reused.
_OCR_READERS = {}


def perform_ocr(image_path, languages=('en',)):
    """Run EasyOCR on an image and return the recognized text as one string.

    Args:
        image_path: Image location, passed unchanged to ``Reader.readtext``.
        languages: Iterable of EasyOCR language codes. Defaults to English
            only, which matches the previous hard-coded behavior.

    Returns:
        The text of every detection (element 1 of each result entry), joined
        with single spaces in the order EasyOCR returned them. An empty
        string when nothing was detected.
    """
    lang_key = tuple(languages)
    reader = _OCR_READERS.get(lang_key)
    if reader is None:
        # First request for this language set: build the reader and keep it.
        reader = easyocr.Reader(list(lang_key))
        _OCR_READERS[lang_key] = reader

    detections = reader.readtext(image_path)
    return ' '.join(detection[1] for detection in detections)

In [31]:
# Step 2: Preprocess the extracted text
def preprocess_text(text):
    """Collapse every run of whitespace in ``text`` to a single space.

    Leading and trailing whitespace is removed as a side effect of splitting,
    so line breaks and indentation do not survive this step.
    """
    tokens = text.split()
    normalized = ' '.join(tokens)
    return normalized

In [32]:
# Step 3: Define the deep learning model
class CodeAnalyzer(keras.Model):
    """Keras model pairing a RoBERTa classifier with two dense heads.

    The pretrained ``roberta-base`` sequence-classification model produces
    ``num_classes`` logits. Those logits feed two independent dense layers:
    a softmax head (language probabilities) and a 128-unit ReLU head
    (feature vector).
    """

    def __init__(self, num_classes):
        """Build the RoBERTa backbone and both dense heads.

        Args:
            num_classes: Number of target classes. Used as RoBERTa's
                ``num_labels`` and as the width of the softmax head.
        """
        super().__init__()
        self.roberta = TFRobertaForSequenceClassification.from_pretrained(
            'roberta-base',
            num_labels=num_classes,
        )
        self.language_classifier = keras.layers.Dense(
            num_classes,
            activation='softmax',
        )
        self.feature_extractor = keras.layers.Dense(128, activation='relu')

    def call(self, inputs):
        """Return ``(language_probs, features)`` for tokenized ``inputs``.

        Both outputs are computed from the RoBERTa logits, not from hidden
        states.
        """
        logits = self.roberta(inputs).logits
        probabilities = self.language_classifier(logits)
        feature_vector = self.feature_extractor(logits)
        return probabilities, feature_vector

In [33]:
# Step 4: Train the model (pseudo-code, as actual training would require a dataset)
def train_model(model, train_dataset, num_epochs, learning_rate):
    """Compile ``model`` with Adam and sparse categorical cross-entropy, then fit it.

    Args:
        model: Object exposing Keras-style ``compile`` and ``fit`` methods.
        train_dataset: Training data, handed to ``model.fit`` as-is.
        num_epochs: Number of epochs passed to ``model.fit``.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A ``(model, history)`` tuple, where ``history`` is whatever
        ``model.fit`` returned.
    """
    # The loss is configured for probabilities (from_logits=False).
    # NOTE(review): CodeAnalyzer.call returns two outputs; confirm how this
    # single loss should apply to them before training for real.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        metrics=['accuracy'],
    )
    training_history = model.fit(train_dataset, epochs=num_epochs)
    return model, training_history

In [34]:
# Step 5: Inference
def predict_language(model, tokenizer, text):
    """Return the ``LANGUAGES`` entry the model rates most probable for ``text``.

    Args:
        model: Callable returning a ``(language_probs, features)`` pair.
        tokenizer: Callable that turns ``text`` into the model's inputs.
        text: Source-code text to classify.

    Returns:
        The language name at the arg-max class index of the first (only) row.
    """
    encoded = tokenizer(
        text,
        return_tensors="tf",
        padding=True,
        truncation=True,
        max_length=512,
    )
    probabilities, _features = model(encoded)
    # One text was passed in, so only row 0 of the arg-max result is read.
    best_index = tf.argmax(probabilities, axis=1).numpy()[0]
    return LANGUAGES[best_index]

In [35]:
# Step 6: Explain key features
# (compiled pattern, description) pairs, checked in this order. Word
# boundaries keep a keyword from matching inside a longer identifier
# ("undef", "subclass", "motif") and still match when the keyword is directly
# followed by punctuation, as in "if(" or "for(".
_FEATURE_PATTERNS = (
    (re.compile(r'\bdef\b'), "Contains function definitions"),
    (re.compile(r'\bclass\b'), "Contains class definitions"),
    (re.compile(r'\bimport\b'), "Imports external modules"),
    (re.compile(r'\b(?:for|while)\b'), "Uses loops"),
    (re.compile(r'\bif\b'), "Contains conditional statements"),
)


def explain_key_features(code):
    """List coarse structural features detected in ``code`` by keyword.

    This is a keyword heuristic, not a parser: a keyword inside a comment or
    a string literal still counts.

    Args:
        code: Source-code text. ``None`` or an empty string yields no features.

    Returns:
        A list of human-readable descriptions, in a fixed order (functions,
        classes, imports, loops, conditionals), one per feature found.
    """
    if not code:
        return []
    return [
        description
        for pattern, description in _FEATURE_PATTERNS
        if pattern.search(code)
    ]

In [36]:
# Step 7: Detect potential errors or best practices violations
# "print" as a whole word, then spaces/tabs, then something other than "(":
# matches the Python 2 statement form (print "x", print x) but not the
# Python 3 call forms print(x) / print (x), nor words such as "blueprint".
_PY2_PRINT_RE = re.compile(r'\bprint[ \t]+[^\s(]')

# Leading whitespace of each non-blank line (captured group 1).
_INDENT_RE = re.compile(r'^([ \t]+)\S', re.MULTILINE)


def detect_issues(code, language):
    """Flag simple, language-specific problems in ``code`` using text heuristics.

    Only Python has checks at the moment:
      * a Python 2 style ``print`` statement;
      * indentation that uses both spaces and tabs, judged only from the
        leading whitespace of each line.

    Args:
        code: Source-code text. ``None`` or an empty string yields no issues.
        language: Language name; compared against ``'Python'``.

    Returns:
        A list of human-readable issue descriptions, possibly empty.
    """
    issues = []
    if not code:
        return issues

    if language == 'Python':
        if _PY2_PRINT_RE.search(code):
            issues.append("Consider using print() function for Python 3 compatibility")

        # Only line-leading whitespace counts; spaces or tabs elsewhere on a
        # line (for example inside string literals) are ignored.
        indents = _INDENT_RE.findall(code)
        uses_spaces = any(' ' in indent for indent in indents)
        uses_tabs = any('\t' in indent for indent in indents)
        if uses_spaces and uses_tabs:
            issues.append("Mixing spaces and tabs for indentation")
    # Add more language-specific checks here
    return issues

In [37]:
# Main pipeline
def analyze_code(image_path, model, tokenizer):
    """Run the full pipeline on one image: OCR, cleanup, classification, heuristics.

    Args:
        image_path: Image to read; forwarded to ``perform_ocr``.
        model: Model forwarded to ``predict_language``.
        tokenizer: Tokenizer forwarded to ``predict_language``.

    Returns:
        A 4-tuple ``(extracted_text, language, features, issues)``.
        ``extracted_text`` is the raw OCR output. The other three are derived
        from the whitespace-normalized text.
    """
    raw_text = perform_ocr(image_path)
    clean_text = preprocess_text(raw_text)

    # Language comes first because the issue checks are language-specific.
    detected_language = predict_language(model, tokenizer, clean_text)
    key_features = explain_key_features(clean_text)
    found_issues = detect_issues(clean_text, detected_language)

    return raw_text, detected_language, key_features, found_issues

In [None]:
# Example usage
# End-to-end demo: upload one image, run analyze_code on it, print the
# results, then show the image. Every name bound here is a notebook-level
# global.
if __name__ == "__main__":
    # Upload an image
    # presumably `uploaded` maps each uploaded filename to its contents — verify
    # against the google.colab docs. Only the first key is used as the path.
    uploaded = files.upload()
    # next() raises StopIteration if `uploaded` is empty (nothing was uploaded).
    image_path = next(iter(uploaded))

    # Initialize the model and tokenizer
    # train_model is not called in this cell, so the prediction below comes
    # from the model exactly as constructed here.
    num_classes = len(LANGUAGES)
    model = CodeAnalyzer(num_classes)
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    # Analyze the code
    extracted_code, language, features, issues = analyze_code(image_path, model, tokenizer)

    # Report: raw OCR text, predicted language, then one bullet per feature/issue.
    print("Extracted Code:")
    print(extracted_code)
    print(f"\nDetected Programming Language: {language}")
    print("\nKey Features:")
    for feature in features:
        print(f"- {feature}")
    print("\nPotential Issues:")
    for issue in issues:
        print(f"- {issue}")

    # Display the uploaded image
    img = plt.imread(image_path)
    plt.imshow(img)
    plt.axis('off')  # hide the axes so only the picture is shown
    plt.show()